diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 660c62884be..00000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -name: Bug report -about: Create a report to help us improve -title: '' -labels: bug -assignees: '' - ---- - - diff --git a/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md b/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md deleted file mode 100644 index 61e797b9ca1..00000000000 --- a/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -name: Feature proposal or discussion -about: Suggest an idea for Kaldi -title: '' -labels: discussion -assignees: '' - ---- - - diff --git a/.github/ISSUE_TEMPLATE/kaldi10-issue.md b/.github/ISSUE_TEMPLATE/kaldi10-issue.md deleted file mode 100644 index 5f2d11d8a0a..00000000000 --- a/.github/ISSUE_TEMPLATE/kaldi10-issue.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -name: Kaldi10 issue -about: This option is for use by core developers only -title: '' -labels: kaldi10-TODO -assignees: '' - ---- - diff --git a/.github/stale.yml b/.github/stale.yml deleted file mode 100644 index a689635d211..00000000000 --- a/.github/stale.yml +++ /dev/null @@ -1,29 +0,0 @@ -# Number of days of inactivity before an issue becomes stale. -daysUntilStale: 60 -# Number of days of inactivity before a stale issue is closed. -# TODO(kkm): Re-enable auto-closing when done with the current heap of old PRs. -daysUntilClose: false -# Issues with these labels will never be considered stale. -exemptLabels: - - discussion - - enhancement - - help-wanted - - in progress - - low-priority - - newbie - - stale-exclude - - stopped development -# Label to use when marking an issue as stale. -staleLabel: stale -# Comment to post when marking an issue as stale. -markComment: > - This issue has been automatically marked as stale by a bot solely because it - has not had recent activity. Please add any comment (simply 'ping' is enough) - to prevent the issue from being closed for 60 more days if you believe it - should be kept open. -# Comment to post when closing a stale issue. -closeComment: > - This issue has been automatically closed by a bot strictly because of - inactivity. This does not mean that we think that this issue is not - important! If you believe it has been closed hastily, add a comment - to the issue and mention @kkm000, and I'll gladly reopen it. 
diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml deleted file mode 100644 index 1331f8e11ce..00000000000 --- a/.github/workflows/c-cpp.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: C/C++ CI - -on: - push: - branches: [ "master" ] - pull_request: - branches: [ "master" ] - -jobs: - build: - - runs-on: ubuntu-latest - env: - CCACHE_DIR: /home/runner/work/kaldi/kaldi/.ccache - CXX: "ccache g++" - CC: "ccache gcc" - - steps: - - uses: actions/checkout@v4 - - name: Install sox - run: sudo apt-get install -y sox intel-mkl - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2 - with: - verbose: 1 - max-size: 3G - - name: make tools - run: cd tools && make -j3 - - name: ccache stats - run: ccache -s - - name: configure - run: cd src && ./configure --shared - - name: make depend - run: cd src && make clean && make depend - - name: make - run: cd src && make -j 3 - - name: make test - run: cd src && make test - - name: upload logs if failure - if: ${{ failure() }} - uses: actions/upload-artifact@v4 - with: - name: fail-logs - path: ${{ github.workspace }}/src/**/*testlog diff --git a/.github/workflows/docker-images.yml b/.github/workflows/docker-images.yml deleted file mode 100644 index f63b761b5e2..00000000000 --- a/.github/workflows/docker-images.yml +++ /dev/null @@ -1,192 +0,0 @@ -name: Docker Image CI - -on: - schedule: - - cron: '37 2 * * 1' - - workflow_dispatch: - inputs: - logLevel: - description: 'Log level' - required: true - default: 'warning' - type: choice - options: - - info - - warning - - debug - -# pull_request: #for debugging purposes -# branches: [ "master" ] - -jobs: - - enable_build: - #if: github.repository_owner == 'jtrmal' || github.repository_owner == 'kaldi-asr' - if: github.repository_owner == 'kaldi-asr' - runs-on: ubuntu-latest - outputs: - enabled: ${{ steps.set-enabled.outputs.enabled }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Set enabled - id: set-enabled - run: | - set -x - echo $(git rev-list --after="1 week" ${{ github.sha }}) - if test -z $(git rev-list --after="1 week" ${{ github.sha }} | tail -n 1) ; then - enabled=false - else - enabled=true - fi - echo "enabled: $enabled" - echo "enabled=${enabled}" >> $GITHUB_OUTPUT - - - docker-buildx-gpu-12: - needs: enable_build - if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - steps: - - name: Maximize build space - uses: AdityaGarg8/remove-unwanted-software@v4.1 - with: - remove-android: 'true' - remove-dotnet: 'true' - remove-haskell: 'true' - remove-codeql: 'true' - remove-docker-images: 'true' - remove-large-packages: 'true' - remove-cached-tools: 'true' - remove-swapfile: 'false' - verbose: 'true' - - uses: actions/checkout@v4 - - name: Set up Docker Buildx - id: buildx - uses: docker/setup-buildx-action@v3 - with: - install: true - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Inspect builder - run: | - echo "Name: ${{ steps.buildx.outputs.name }}" - echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}" - echo "Status: ${{ steps.buildx.outputs.status }}" - echo "Flags: ${{ steps.buildx.outputs.flags }}" - echo "Platforms: ${{ steps.buildx.outputs.platforms }}" - - name: Build and push - run: | - cd docker/ubuntu22.04-cuda12 - docker build --push --tag kaldiasr/kaldi:gpu-latest --tag kaldiasr/kaldi:gpu-ubuntu22.04-cuda12 --tag 
kaldiasr/kaldi:gpu-ubuntu22.04-cuda12-$(date +%F) . - - docker-buildx-gpu-cuda11: - needs: enable_build - if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - steps: - - name: Maximize build space - uses: AdityaGarg8/remove-unwanted-software@v4.1 - with: - remove-android: 'true' - remove-dotnet: 'true' - remove-haskell: 'true' - remove-codeql: 'true' - remove-docker-images: 'true' - remove-large-packages: 'true' - remove-cached-tools: 'true' - remove-swapfile: 'false' - verbose: 'true' - - uses: actions/checkout@v4 - - name: Set up Docker Buildx - id: buildx - uses: docker/setup-buildx-action@v3 - with: - install: true - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Inspect builder - run: | - echo "Name: ${{ steps.buildx.outputs.name }}" - echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}" - echo "Status: ${{ steps.buildx.outputs.status }}" - echo "Flags: ${{ steps.buildx.outputs.flags }}" - echo "Platforms: ${{ steps.buildx.outputs.platforms }}" - - name: Build and push - run: | - cd docker/ubuntu20.04-cuda11 - docker build --push --tag kaldiasr/kaldi:gpu-ubuntu20.04-cuda11 --tag kaldiasr/kaldi:gpu-ubuntu20.04-cuda11-$(date +%F) . - - docker-buildx-cpu-openblas: - needs: enable_build - if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Docker Buildx - id: buildx - uses: docker/setup-buildx-action@v3 - with: - install: true - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Inspect builder - run: | - echo "Name: ${{ steps.buildx.outputs.name }}" - echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}" - echo "Status: ${{ steps.buildx.outputs.status }}" - echo "Flags: ${{ steps.buildx.outputs.flags }}" - echo "Platforms: ${{ steps.buildx.outputs.platforms }}" - - name: Build and push - run: | - cd docker/debian12-cpu/ - docker build --push \ - --tag kaldiasr/kaldi:latest \ - --tag kaldiasr/kaldi:cpu-latest \ - --tag kaldiasr/kaldi:cpu-latest-openblas \ - --tag kaldiasr/kaldi:cpu-debian12-openblas \ - --tag kaldiasr/kaldi:cpu-debian12-openblas-$(date +%F) . - - docker-buildx-cpu-mkl: - needs: enable_build - if: needs.enable_build.outputs.enabled == 'true' || github.event_name == 'push' || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Docker Buildx - id: buildx - uses: docker/setup-buildx-action@v3 - with: - install: true - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Inspect builder - run: | - echo "Name: ${{ steps.buildx.outputs.name }}" - echo "Endpoint: ${{ steps.buildx.outputs.endpoint }}" - echo "Status: ${{ steps.buildx.outputs.status }}" - echo "Flags: ${{ steps.buildx.outputs.flags }}" - echo "Platforms: ${{ steps.buildx.outputs.platforms }}" - - name: Build and push - run: | - cd docker/debian12-cpu-mkl/ - docker build --push \ - --tag kaldiasr/kaldi:cpu-latest-mkl \ - --tag kaldiasr/kaldi:cpu-debian12-mkl \ - --tag kaldiasr/kaldi:cpu-debian12-mkl-$(date +%F) . 
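A note on the enable_build job in the docker-images.yml workflow deleted above: it gates the weekly scheduled image builds on repository activity, rebuilding only when at least one commit landed in the past week. A minimal standalone sketch of that check, assuming a full-history checkout (fetch-depth: 0) and the GITHUB_OUTPUT mechanism the original job uses:

    # Emit enabled=true only if the last week saw at least one commit.
    if test -z "$(git rev-list --after='1 week' HEAD | tail -n 1)"; then
      enabled=false
    else
      enabled=true
    fi
    echo "enabled=${enabled}" >> "$GITHUB_OUTPUT"

The downstream docker-buildx-* jobs then run only when needs.enable_build.outputs.enabled is 'true', or when triggered by a push or a manual workflow_dispatch.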
- - diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f26596ce86..e0ca3ea2871 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.18) +cmake_minimum_required(VERSION 3.13) project(kaldi) if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) @@ -20,25 +20,7 @@ if(CONDA_ROOT) endif() -option(BuildForFedora "Build for Fedora. Means that everything is build with Border tools" NO) - -if(BuildForFedora) - - # You also need to install sudo dnf install lapack-devel openfst-devel - set(CMAKE_CXX_STANDARD 17) - set(CMAKE_CXX_STANDARD_REQUIRED ON) - set(CMAKE_CXX_EXTENSIONS OFF) - - #find_package(PkgConfig REQUIRED) - - #pkg_check_modules(FST REQUIRED fst) - -else() - include(third_party/get_third_party) - - include(cmake/third_party/openfst.cmake) -endif() - +include(third_party/get_third_party) find_package(PythonInterp) if(NOT PYTHON_EXECUTABLE) @@ -59,11 +41,8 @@ execute_process(COMMAND ${PYTHON_EXECUTABLE} ) unset(IS_LIB_SHARE) -if(BuildForFedora) -else() - set(CMAKE_CXX_STANDARD 14) - set(CMAKE_CXX_EXTENSIONS OFF) -endif() +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_INSTALL_MESSAGE LAZY) # hide "-- Up-to-date: ..." if(BUILD_SHARED_LIBS) set(CMAKE_POSITION_INDEPENDENT_CODE ON) @@ -247,19 +226,8 @@ endif() # PATHS "${CMAKE_CURRENT_SOURCE_DIR}/tools/openfst/include" # REQUIRED) -if(BuildForFedora) - # Version used used by Fedora 41 is 1.83 - # TODO: Detect the right version and put it here. - add_definitions(-DOPENFST_VER=18300) -# link_directories(/usr/lib64) -# include_directories(/usr/include/fst) -endif() - link_libraries(fst) - - - # add all native libraries add_subdirectory(src/base) # NOTE, we need to patch the target with version from outside set_property(TARGET kaldi-base PROPERTY COMPILE_DEFINITIONS "KALDI_VERSION=\"${KALDI_VERSION}\"") diff --git a/COPYING b/COPYING index 2b0dbd4243a..5a5cab00a29 100644 --- a/COPYING +++ b/COPYING @@ -57,72 +57,72 @@ License v 2.0 are set forth below. Individual Contributors (in alphabetical order) - Albert Vernon - Alexander Solovets - Allen Guo - Ariya Rastrow - Arnab Ghoshal - Cisco Corporation - Daniel Galvez - Daniel Povey - Danijel Korzinek - David Snyder - Dogan Can - Eduardo Silva - Ewald Enzinger - Gaofeng Cheng - Gaurav Kumar - Georg Stemmer + Mohit Agarwal + Tanel Alumae Gilles Boulianne - Go Vivace Inc. + Lukas Burget + Dogan Can Guoguo Chen - Haihua Xu - Hainan Xu - Hendy Irawan - Hossein Hadian + Gaofeng Cheng + Cisco Corporation + Pavel Denisov Ilya Edrenkin - Jan "Yenda" Trmal - Jan Silovsky + Ewald Enzinger Joachim Fainberg - Johns Hopkins University - Karel Vesely - Ke Li - Kirill Katsnelson - Lucas Ondel - Lukas Burget + Daniel Galvez + Pegah Ghahremani + Arnab Ghoshal + Ondrej Glembek + Go Vivace Inc. + Allen Guo + Hossein Hadian Lv Hang - Matthew Maciejewski - Microsoft Corporation - Minhua Wu Mirko Hannemann - Mohit Agarwal + Hendy Irawan Navdeep Jaitly - Nickolay V. Shmyrev - Omid Sadjadi - Ondrej Glembek - Ondrej Platek - Pavel Denisov - Pawel Swietojanski - Pegah Ghahremani - Peter Smit - Petr Motlicek - Petr Schwarz - Phonexia s.r.o. 
- Saarland University - Shinji Watanabe + Johns Hopkins University Shiyin Kang - Tanel Alumae + Kirill Katsnelson Tom Ko - Vassil Panayotov - Vijayaditya Peddinti + Danijel Korzinek + Gaurav Kumar + Ke Li + Matthew Maciejewski Vimal Manohar - Vincent Nguyen - Xiaohui Zhang - Xingyu Na Yajie Miao + Microsoft Corporation + Petr Motlicek + Xingyu Na + Vincent Nguyen + Lucas Ondel + Vassil Panayotov + Vijayaditya Peddinti + Phonexia s.r.o. + Ondrej Platek + Daniel Povey Yanmin Qian - Yiming Wang + Ariya Rastrow + Saarland University + Omid Sadjadi + Petr Schwarz Yiwen Shao + Nickolay V. Shmyrev + Jan Silovsky + Eduardo Silva + Peter Smit + David Snyder + Alexander Solovets + Georg Stemmer + Pawel Swietojanski + Jan "Yenda" Trmal + Albert Vernon + Karel Vesely + Yiming Wang + Shinji Watanabe + Minhua Wu + Haihua Xu + Hainan Xu + Xiaohui Zhang Other Source Material diff --git a/README.md b/README.md index 1a0d6ce0125..e915a3096e8 100644 --- a/README.md +++ b/README.md @@ -52,22 +52,6 @@ Development pattern for contributors Platform specific notes ----------------------- -### Fedora 41 (and later) - -In order to build it on Fedora 41 using the libraries that are provided by the distro, you need to install the development libraries and dependencies with - -``` -sudo dnf install lapack-devel openfst-devel -``` - -then build the package as follows: - -``` -cmake -S ./ -Bbuild/Release -DFETCHCONTENT_FULLY_DISCONNECTED=ON -DBuildForFedora=ON -cmake --build /home/gerhard/workspace/kaldi/build/Release -``` - - ### PowerPC 64bits little-endian (ppc64le) - Kaldi is expected to work out of the box in RHEL >= 7 and Ubuntu >= 16.04 with @@ -86,6 +70,6 @@ cmake --build /home/gerhard/workspace/kaldi/build/Release ### Web Assembly - Kaldi supports cross compiling for Web Assembly for in-browser execution - using [emscripten](https://emscripten.org) and OpenBLAS -- See [this repo](https://github.com/msqr1/kaldi-wasm2) + using [emscripten](https://emscripten.org/) and CLAPACK. +- See [this post](https://gitlab.inria.fr/kaldi.web/kaldi-wasm/-/wikis/build_details.md) for a step-by-step description of the build process. 
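One detail of the Fedora section removed from README.md above: its second command hardcoded a contributor's working directory (/home/gerhard/workspace/...). A portable form of the removed instructions, assuming the BuildForFedora option from the CMake code this patch also removes, would be:

    sudo dnf install lapack-devel openfst-devel
    cmake -S . -B build/Release -DFETCHCONTENT_FULLY_DISCONNECTED=ON -DBuildForFedora=ON
    cmake --build build/Release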
diff --git a/cmake/gen_cmake_skeleton.py b/cmake/gen_cmake_skeleton.py index c8fee4c415f..5925c6369a8 100644 --- a/cmake/gen_cmake_skeleton.py +++ b/cmake/gen_cmake_skeleton.py @@ -269,7 +269,7 @@ def gen_code(self): if len(self.depends) > 0: ret.append("target_link_libraries(" + self.target_name + " PUBLIC") - for d in self.depends + ['-lcblas', '-llapack']: + for d in self.depends: ret.append(" " + d) ret.append(")\n") diff --git a/docker/debian12-cpu-mkl/Dockerfile b/docker/debian10-cpu/Dockerfile similarity index 52% rename from docker/debian12-cpu-mkl/Dockerfile rename to docker/debian10-cpu/Dockerfile index aae82d24b93..05079922d03 100644 --- a/docker/debian12-cpu-mkl/Dockerfile +++ b/docker/debian10-cpu/Dockerfile @@ -1,10 +1,9 @@ -FROM debian:12 -LABEL maintainer="jtrmal@apptek.com" +FROM debian:10 +LABEL maintainer="rick@scriptix.io" RUN apt-get update && \ apt-get install -y --no-install-recommends \ g++ \ - gfortran \ make \ automake \ autoconf \ @@ -14,21 +13,29 @@ RUN apt-get update && \ sox \ libtool \ git \ + subversion \ + python2.7 \ python3 \ zlib1g-dev \ ca-certificates \ + gfortran \ patch \ - python-is-python3 && \ + ffmpeg \ + vim && \ rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/python3 /usr/bin/python RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi #EOL RUN cd /opt/kaldi/tools && \ - ./extras/install_mkl.sh && \ - make -j 5 && \ + ./extras/install_mkl.sh && \ + make -j $(nproc) && \ cd /opt/kaldi/src && \ ./configure --shared && \ make depend -j $(nproc) && \ - make -j 5 - + make -j $(nproc) && \ + find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ + find /opt/intel -type f -name "*.a" -exec rm {} \; && \ + find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ + rm -rf /opt/kaldi/.git WORKDIR /opt/kaldi/ diff --git a/docker/debian12-cpu/Dockerfile b/docker/debian12-cpu/Dockerfile deleted file mode 100644 index 6c286d6ba24..00000000000 --- a/docker/debian12-cpu/Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -FROM debian:12 -LABEL maintainer="jtrmal@apptek.com" - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - g++ \ - gfortran \ - make \ - automake \ - autoconf \ - bzip2 \ - unzip \ - wget \ - sox \ - libtool \ - git \ - python3 \ - zlib1g-dev \ - ca-certificates \ - patch \ - python-is-python3 && \ - rm -rf /var/lib/apt/lists/* - - -RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi #EOL -RUN cd /opt/kaldi/tools && \ - ./extras/install_openblas.sh && \ - make -j 5 && \ - cd /opt/kaldi/src && \ - ./configure --shared --mathlib=OPENBLAS && \ - make depend -j $(nproc) && \ - make -j 5 - -WORKDIR /opt/kaldi/ diff --git a/docker/debian9.8-cpu/Dockerfile b/docker/debian9.8-cpu/Dockerfile new file mode 100644 index 00000000000..ba694d1fb96 --- /dev/null +++ b/docker/debian9.8-cpu/Dockerfile @@ -0,0 +1,43 @@ + +FROM debian:9.8 +LABEL maintainer="mdoulaty@gmail.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + ca-certificates \ + gfortran \ + patch \ + ffmpeg \ + vim && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + 
./configure --shared && \ + make depend -j $(nproc) && \ + make -j $(nproc) && \ + find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ + find /opt/intel -type f -name "*.a" -exec rm {} \; && \ + find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ + rm -rf /opt/kaldi/.git +WORKDIR /opt/kaldi/ + diff --git a/docker/ubuntu22.04-cuda12/Dockerfile b/docker/ubuntu16.04-gpu/Dockerfile similarity index 61% rename from docker/ubuntu22.04-cuda12/Dockerfile rename to docker/ubuntu16.04-gpu/Dockerfile index cb12b6abdd0..41fc78beb83 100644 --- a/docker/ubuntu22.04-cuda12/Dockerfile +++ b/docker/ubuntu16.04-gpu/Dockerfile @@ -1,39 +1,44 @@ -FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 -LABEL maintainer="jtrmal@apptek.com" + +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 +LABEL maintainer="mdoulaty@gmail.com" RUN apt-get update && \ apt-get install -y --no-install-recommends \ - build-essential \ g++ \ make \ automake \ + autoconf \ bzip2 \ unzip \ wget \ + sox \ libtool \ git \ + subversion \ + python2.7 \ python3 \ zlib1g-dev \ - ca-certificates \ gfortran \ + ca-certificates \ patch \ - sox \ - software-properties-common && \ - apt-add-repository multiverse && \ - apt-get update && \ - yes | DEBIAN_FRONTEND=noninteractive apt-get install -yqq --no-install-recommends\ - intel-mkl && \ + ffmpeg \ + vim && \ rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/python2.7 /usr/bin/python RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ make -j $(nproc) && \ cd /opt/kaldi/src && \ - ./configure --shared --use-cuda=yes && \ + ./configure --shared --use-cuda && \ make depend -j $(nproc) && \ make -j $(nproc) && \ find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ + find /opt/intel -type f -name "*.a" -exec rm {} \; && \ + find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ rm -rf /opt/kaldi/.git WORKDIR /opt/kaldi/ + diff --git a/docker/ubuntu16.04-gpu/ubuntu18.04-cuda10.0 b/docker/ubuntu16.04-gpu/ubuntu18.04-cuda10.0 new file mode 100644 index 00000000000..41fc78beb83 --- /dev/null +++ b/docker/ubuntu16.04-gpu/ubuntu18.04-cuda10.0 @@ -0,0 +1,44 @@ + +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 +LABEL maintainer="mdoulaty@gmail.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + gfortran \ + ca-certificates \ + patch \ + ffmpeg \ + vim && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + ./configure --shared --use-cuda && \ + make depend -j $(nproc) && \ + make -j $(nproc) && \ + find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ + find /opt/intel -type f -name "*.a" -exec rm {} \; && \ + find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ + rm -rf /opt/kaldi/.git + +WORKDIR /opt/kaldi/ + diff --git a/docker/ubuntu20.04-cuda11/Dockerfile b/docker/ubuntu18.04-cuda10.0/Dockerfile similarity index 57% rename from docker/ubuntu20.04-cuda11/Dockerfile rename to 
docker/ubuntu18.04-cuda10.0/Dockerfile index 81126cd96ac..0c75863fedd 100644 --- a/docker/ubuntu20.04-cuda11/Dockerfile +++ b/docker/ubuntu18.04-cuda10.0/Dockerfile @@ -1,40 +1,44 @@ -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -LABEL maintainer="jtrmal@apptek.com" -ARG DEBIAN_FRONTEND=noninteractive +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 +LABEL maintainer="mdoulaty@gmail.com" + RUN apt-get update && \ - apt-get install -yqq --no-install-recommends \ - build-essential \ + apt-get install -y --no-install-recommends \ g++ \ make \ automake \ + autoconf \ bzip2 \ unzip \ wget \ + sox \ libtool \ git \ + subversion \ + python2.7 \ python3 \ zlib1g-dev \ - ca-certificates \ gfortran \ + ca-certificates \ patch \ - sox \ - software-properties-common && \ - apt-add-repository multiverse && \ - apt-get update && \ - yes | DEBIAN_FRONTEND=noninteractive apt-get install -yqq --no-install-recommends\ - intel-mkl && \ + ffmpeg \ + vim && \ rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/python2.7 /usr/bin/python RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ make -j $(nproc) && \ cd /opt/kaldi/src && \ - ./configure --shared --use-cuda=yes && \ + ./configure --shared --use-cuda && \ make depend -j $(nproc) && \ make -j $(nproc) && \ find /opt/kaldi -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \ + find /opt/intel -type f -name "*.a" -exec rm {} \; && \ + find /opt/intel -type f -regex '.*\(_mc.?\|_mic\|_thread\|_ilp64\)\.so' -exec rm {} \; && \ rm -rf /opt/kaldi/.git WORKDIR /opt/kaldi/ + diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index ed91a980791..0d40d25c23a 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_mdm.sh b/egs/ami/s5/run_mdm.sh index 0cc76a56dd0..4389c6b5d81 100755 --- a/egs/ami/s5/run_mdm.sh +++ b/egs/ami/s5/run_mdm.sh @@ -10,7 +10,7 @@ mic=mdm$nmics # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5/run_sdm.sh b/egs/ami/s5/run_sdm.sh index a212a8846b2..17e2071f1f6 100755 --- a/egs/ami/s5/run_sdm.sh +++ b/egs/ami/s5/run_sdm.sh @@ -17,7 +17,7 @@ set -euxo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5b/cmd.sh b/egs/ami/s5b/cmd.sh index a8ea5d7c1ba..b004c5569df 100644 --- a/egs/ami/s5b/cmd.sh +++ b/egs/ami/s5b/cmd.sh @@ -15,7 +15,7 @@ export decode_cmd="queue.pl --mem 2G" # the use of cuda_cmd is deprecated, used only in 
'nnet1', export cuda_cmd="queue.pl --gpu 1 --mem 20G" -if [[ "$(hostname -d)" == "fit.vutbr.cz" ]]; then +if [[ "$(hostname -f)" == "*.fit.vutbr.cz" ]]; then queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" diff --git a/egs/ami/s5b/conf/ami_beamformit.cfg b/egs/ami/s5b/conf/ami_beamformit.cfg deleted file mode 100644 index 70fdd858651..00000000000 --- a/egs/ami/s5b/conf/ami_beamformit.cfg +++ /dev/null @@ -1,50 +0,0 @@ -#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) - -# scrolling size to compute the delays -scroll_size = 250 - -# cross correlation computation window size -window_size = 500 - -#amount of maximum points for the xcorrelation taken into account -nbest_amount = 4 - -#flag wether to apply an automatic noise thresholding -do_noise_threshold = 1 - -#Percentage of frames with lower xcorr taken as noisy -noise_percent = 10 - -######## acoustic modelling parameters - -#transition probabilities weight for multichannel decoding -trans_weight_multi = 25 -trans_weight_nbest = 25 - -### - -#flag wether to print the feaures after setting them, or not -print_features = 1 - -#flag wether to use the bad frames in the sum process -do_avoid_bad_frames = 1 - -#flag to use the best channel (SNR) as a reference -#defined from command line -do_compute_reference = 1 - -#flag wether to use a uem file or not(process all the file) -do_use_uem_file = 0 - -#flag wether to use an adaptative weights scheme or fixed weights -do_adapt_weights = 1 - -#flag wether to output the sph files or just run the system to create the auxiliary files -do_write_sph_files = 1 - -####directories where to store/retrieve info#### -#channels_file = ./cfg-files/channels - -#show needs to be passed as argument normally, here a default one is given just in case -#show_id = Ttmp - diff --git a/egs/ami/s5b/run.sh b/egs/ami/s5b/run.sh index 94cd81f230b..79989f17004 100755 --- a/egs/ami/s5b/run.sh +++ b/egs/ami/s5b/run.sh @@ -28,7 +28,7 @@ set -euo pipefail # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac diff --git a/egs/ami/s5c/run.sh b/egs/ami/s5c/run.sh index 1281cad2e43..cc4cd87610b 100755 --- a/egs/ami/s5c/run.sh +++ b/egs/ami/s5c/run.sh @@ -3,7 +3,7 @@ # Apache 2.0. # # This recipe performs diarization for the mix-headset data in the -# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 +# AMI dataset. The x-vector extractor we use is trained on VoxCeleb v2 # corpus with simulated RIRs. We use oracle SAD in this recipe. # This recipe demonstrates the following: # 1. 
Diarization using x-vector and clustering (AHC, VBx, spectral) @@ -38,7 +38,7 @@ diarizer_type=spectral # must be one of (ahc, spectral, vbx) # Path where AMI gets downloaded (or where locally available): AMI_DIR=$PWD/wav_db # Default, case $(hostname -d) in - fit.vutbr.cz) AMI_DIR=/mnt/matylda2/data/AMI_KALDI_DOWNLOAD ;; # BUT, + fit.vutbr.cz) AMI_DIR=/mnt/matylda5/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora5/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, esac @@ -57,7 +57,7 @@ if [ $stage -le 1 ]; then local/ami_download.sh $mic $AMI_DIR fi -# Prepare data directories. +# Prepare data directories. if [ $stage -le 2 ]; then # Download the data split and references from BUT's AMI setup if ! [ -d AMI-diarization-setup ]; then @@ -120,7 +120,7 @@ if [ $stage -le 6 ]; then transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- |\ ivector-normalize-length ark:- ark:- |" \ $model_dir/xvectors_plda_train/plda || exit 1; - + cp $model_dir/xvectors_plda_train/plda $model_dir/ cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ diff --git a/egs/babel/s5d/local/syllab/lattice_word2syll.sh b/egs/babel/s5d/local/syllab/lattice_word2syll.sh index 6e20e78ff73..c643b55d527 100755 --- a/egs/babel/s5d/local/syllab/lattice_word2syll.sh +++ b/egs/babel/s5d/local/syllab/lattice_word2syll.sh @@ -30,25 +30,25 @@ if [ -f $olang/lex.words2syllabs.fst ] ; then $cmd JOB=1:$nj $output/log/convert.JOB.log \ lattice-push --push-strings ark:"gunzip -c $input/lat.JOB.gz|" ark:- \| \ - lattice-lmrescore --lm-scale=-1.0 ark:- "fstproject --project_output=true $ilang/G.fst|" ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 ark:- "fstproject --project_type=output $ilang/G.fst|" ark:- \| \ lattice-compose ark:- $output/L.fst ark:- \| \ lattice-determinize-pruned --beam=8 --acoustic-scale=0.1 ark:- ark:- \| \ lattice-minimize ark:- "ark:|gzip -c > $output/lat.JOB.gz" #lattice-minimize ark:- ark:- \| \ - #lattice-lmrescore --lm-scale=1.0 ark:- "fstproject --project_output=true $olang/G.fst|" "ark:|gzip -c > $output/lat.JOB.gz" + #lattice-lmrescore --lm-scale=1.0 ark:- "fstproject --project_type=output $olang/G.fst|" "ark:|gzip -c > $output/lat.JOB.gz" else #for phonemes.... 
(IIRC) fstreverse $olang/L.fst | fstminimize | fstreverse > $output/L.fst $cmd JOB=1:$nj $output/log/convert.JOB.log \ lattice-push --push-strings ark:"gunzip -c $input/lat.JOB.gz|" ark:- \| \ - lattice-lmrescore --lm-scale=-1.0 ark:- "fstproject --project_output=true $ilang/G.fst|" ark:- \| \ + lattice-lmrescore --lm-scale=-1.0 ark:- "fstproject --project_type=output $ilang/G.fst|" ark:- \| \ lattice-align-words $ilang/phones/word_boundary.int $input/../final.mdl ark:- ark:- \| \ lattice-to-phone-lattice --replace-words $input/../final.mdl ark:- ark:- \| \ lattice-align-phones $input/../final.mdl ark:- ark:- \| \ lattice-compose ark:- $output/L.fst ark:- \|\ lattice-determinize-pruned --beam=$beam --acoustic-scale=$acwt ark:- ark:-\| \ lattice-minimize ark:- "ark:|gzip -c > $output/lat.JOB.gz" - #lattice-lmrescore --lm-scale=1.0 ark:- "fstproject --project_output=true $olang/G.fst|" ark:"|gzip -c > $output/lat.JOB.gz" + #lattice-lmrescore --lm-scale=1.0 ark:- "fstproject --project_type=output $olang/G.fst|" ark:"|gzip -c > $output/lat.JOB.gz" fi #lattice-1best ark:- ark:-| nbest-to-linear ark:- ark:/dev/null ark,t:- \ diff --git a/egs/gop_speechocean762/README.md b/egs/gop_speechocean762/README.md index 1c39f2f1cc6..77b520eadee 100644 --- a/egs/gop_speechocean762/README.md +++ b/egs/gop_speechocean762/README.md @@ -1,3 +1,8 @@ +There is a copy of this document on Google Docs, which renders the equations better: +[link](https://docs.google.com/document/d/1pie-PU6u2NZZC_FzocBGGm6mpfBJMiCft9UoG0uA1kA/edit?usp=sharing) + +* * * + # GOP on Kaldi The Goodness of Pronunciation (GOP) is a variation of the posterior probability, for phone level pronunciation scoring. diff --git a/egs/gop_speechocean762/s5/local/visualize_feats.py b/egs/gop_speechocean762/s5/local/visualize_feats.py index 202c6a57b6b..3b3ddaa037a 100644 --- a/egs/gop_speechocean762/s5/local/visualize_feats.py +++ b/egs/gop_speechocean762/s5/local/visualize_feats.py @@ -8,7 +8,6 @@ import random import kaldi_io import seaborn as sns -import numpy as np from collections import Counter from sklearn.manifold import TSNE from utils import load_human_scores, load_phone_symbol_table @@ -63,9 +62,6 @@ def main(): min(args.samples, len(lables))) features, lables = list(zip(*sampled_paris)) - # Convert the tuple of arrays to a single 2D array - features = np.vstack(features) - # Draw scatters label_counter = Counter(lables) colors = sns.color_palette("colorblind", len(label_counter)) diff --git a/egs/gop_speechocean762/s5/run.sh b/egs/gop_speechocean762/s5/run.sh index 989d247736f..cf081a18133 100755 --- a/egs/gop_speechocean762/s5/run.sh +++ b/egs/gop_speechocean762/s5/run.sh @@ -2,7 +2,6 @@ # Copyright 2019 Junbo Zhang # 2020-2021 Xiaomi Corporation (Author: Junbo Zhang, Yongqing Wang) -# 2024 Jiun-Ting Li (National Taiwan Normal University) # Apache 2.0 # This script shows how to calculate Goodness of Pronunciation (GOP) and @@ -176,7 +175,6 @@ if [ $stage -le 12 ]; then compute-gop --phone-map=data/lang_nosp/phone-to-pure-phone.int \ --skip-phones-string=0:1:2 \ $model/final.mdl \ - "ark,t:gunzip -c exp/ali_$part/ali.JOB.gz|" \ "ark,t:gunzip -c exp/ali_$part/ali-phone.JOB.gz|" \ "ark:exp/probs_$part/output.JOB.ark" \ "ark,scp:exp/gop_$part/gop.JOB.ark,exp/gop_$part/gop.JOB.scp" \ diff --git a/egs/gp/s1/utils/lmrescore.sh b/egs/gp/s1/utils/lmrescore.sh index 9e706395c4f..1a73f0c04a0 100755 --- a/egs/gp/s1/utils/lmrescore.sh +++ b/egs/gp/s1/utils/lmrescore.sh @@ -85,8 +85,8 @@ newlm=$newlang/G.fst ! 
ls $indir/lat.*.gz >/dev/null && echo "No lattices input directory $indir" && exit 1; -oldlmcommand="fstproject --project_output=true $oldlm |" -newlmcommand="fstproject --project_output=true $newlm |" +oldlmcommand="fstproject --project_type=output $oldlm |" +newlmcommand="fstproject --project_type=output $newlm |" mkdir -p $outdir; @@ -124,10 +124,10 @@ case "$mode" in submit_jobs.sh "$qcmd" --njobs=$nj --log=$outdir/rescorelm.TASK_ID.log \ $sjopts gunzip -c $lat \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ - lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \ + lattice-compose ark:- "fstproject --project_type=output $oldlm |" ark:- \ \| lattice-determinize ark:- ark:- \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ - lattice-compose ark:- "fstproject --project_output=true $newlm |" ark:- \ + lattice-compose ark:- "fstproject --project_type=output $newlm |" ark:- \ \| lattice-determinize ark:- ark:- \| \ gzip -c \>$newlat || error_exit "Error doing LM rescoring." ;; @@ -138,7 +138,7 @@ case "$mode" in submit_jobs.sh "$qcmd" --njobs=$nj --log=$outdir/rescorelm.TASK_ID.log \ $sjopts gunzip -c $lat \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ - lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \ + lattice-compose ark:- "fstproject --project_type=output $oldlm |" ark:- \ \| \ lattice-determinize ark:- ark:- \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \ diff --git a/egs/librispeech/s5/fairseq_ltlm/recipes/scripts/prepare_egs.sh b/egs/librispeech/s5/fairseq_ltlm/recipes/scripts/prepare_egs.sh index 99ac1ada7b0..0d7ed563bf8 100755 --- a/egs/librispeech/s5/fairseq_ltlm/recipes/scripts/prepare_egs.sh +++ b/egs/librispeech/s5/fairseq_ltlm/recipes/scripts/prepare_egs.sh @@ -100,7 +100,7 @@ fi if [ -f $g_fst ] && [ "$g_fst_weight" != "0" ] ; then echo "Applying negative rescoring with lm $g_fst, weight $g_fst_weight" - lattice_reader="gunzip -c $prunned_lats/lat.JOB.gz | lattice-lmrescore --lm-scale=$g_fst_weight ark:- 'fstproject --project_output=true $g_fst |' ark,t:-" + lattice_reader="gunzip -c $prunned_lats/lat.JOB.gz | lattice-lmrescore --lm-scale=$g_fst_weight ark:- 'fstproject --project_type=output $g_fst |' ark,t:-" else lattice_reader="gunzip -c $prunned_lats/lat.JOB.gz | lattice-copy ark:- ark,t:- " fi diff --git a/egs/mini_librispeech/s5/local/grammar/simple_demo.sh b/egs/mini_librispeech/s5/local/grammar/simple_demo.sh index a4edeb8091c..c3a9e3905ae 100755 --- a/egs/mini_librispeech/s5/local/grammar/simple_demo.sh +++ b/egs/mini_librispeech/s5/local/grammar/simple_demo.sh @@ -160,7 +160,7 @@ if [ $stage -le 6 ]; then echo "$0: will print costs with the two FSTs, for one random path." 
fstrandgen $tree_dir/grammar1/HCLG.fst > path.fst for x in 1 2; do - fstproject --project_output=false path.fst | fstcompose - $tree_dir/grammar${x}/HCLG.fst | fstcompose - <(fstproject --project_output=true path.fst) > composed.fst + fstproject --project_output=false path.fst | fstcompose - $tree_dir/grammar${x}/HCLG.fst | fstcompose - <(fstproject --project_type=output path.fst) > composed.fst start_state=$(fstprint composed.fst | head -n 1 | awk '{print $1}') fstshortestdistance --reverse=true composed.fst | awk -v s=$start_state '{if($1 == s) { print $2; }}' done diff --git a/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh b/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh index 414227f2ad6..7c7232055b3 100755 --- a/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh +++ b/egs/mini_librispeech/s5/local/grammar/simple_demo_silprobs.sh @@ -158,7 +158,7 @@ if [ $stage -le 6 ]; then echo "$0: will print costs with the two FSTs, for one random path." fstrandgen $tree_dir/grammar1/HCLG.fst > path.fst for x in 1 2; do - fstproject --project_output=false path.fst | fstcompose - $tree_dir/grammar${x}/HCLG.fst | fstcompose - <(fstproject --project_output=true path.fst) > composed.fst + fstproject --project_output=false path.fst | fstcompose - $tree_dir/grammar${x}/HCLG.fst | fstcompose - <(fstproject --project_type=output path.fst) > composed.fst start_state=$(fstprint composed.fst | head -n 1 | awk '{print $1}') fstshortestdistance --reverse=true composed.fst | awk -v s=$start_state '{if($1 == s) { print $2; }}' done diff --git a/egs/wsj/s5/steps/decode_biglm.sh b/egs/wsj/s5/steps/decode_biglm.sh index f57191ed290..c4f3980bd08 100755 --- a/egs/wsj/s5/steps/decode_biglm.sh +++ b/egs/wsj/s5/steps/decode_biglm.sh @@ -73,8 +73,8 @@ esac # fstproject replaces the disambiguation symbol #0, which only appears on the # input side, with the <eps> that appears in the corresponding arcs on the output side.
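# (Note on the change below, which recurs in the hunks that follow: OpenFst 1.8
# renamed fstproject's boolean --project_output flag to --project_type, so
# --project_output=true is now written --project_type=output.)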
-oldlm_cmd="fstproject --project_output=true $oldlm_fst | fstarcsort --sort_type=ilabel |" -newlm_cmd="fstproject --project_output=true $newlm_fst | fstarcsort --sort_type=ilabel |" +oldlm_cmd="fstproject --project_type=output $oldlm_fst | fstarcsort --sort_type=ilabel |" +newlm_cmd="fstproject --project_type=output $newlm_fst | fstarcsort --sort_type=ilabel |" $cmd JOB=1:$nj $dir/log/decode.JOB.log \ gmm-latgen-biglm-faster --max-active=$maxactive --beam=$beam --lattice-beam=$lattice_beam \ diff --git a/egs/wsj/s5/steps/decode_fromlats.sh b/egs/wsj/s5/steps/decode_fromlats.sh index 4822953ea0e..af04948486e 100755 --- a/egs/wsj/s5/steps/decode_fromlats.sh +++ b/egs/wsj/s5/steps/decode_fromlats.sh @@ -77,7 +77,7 @@ esac $cmd JOB=1:$nj $dir/log/decode_lats.JOB.log \ lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \ - fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \ + fsttablecompose "fstproject --project_type=output $lang/G.fst | fstarcsort |" ark:- ark:- \| \ fstdeterminizestar ark:- ark:- \| \ compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \ --batch-size=$batch_size $scale_opts $srcdir/tree $srcdir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ diff --git a/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh b/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh index 8fd5c29aa50..703e71b3b57 100755 --- a/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh +++ b/egs/wsj/s5/steps/decode_sgmm2_fromlats.sh @@ -134,7 +134,7 @@ fi if [ $stage -le 2 ]; then $cmd JOB=1:$nj $dir/log/decode_pass1.JOB.log \ lattice-to-fst "ark:gunzip -c $olddir/lat.JOB.gz|" ark:- \| \ - fsttablecompose "fstproject --project_output=true $lang/G.fst | fstarcsort |" ark:- ark:- \| \ + fsttablecompose "fstproject --project_type=output $lang/G.fst | fstarcsort |" ark:- ark:- \| \ fstdeterminizestar ark:- ark:- \| \ compile-train-graphs-fsts --read-disambig-syms=$lang/phones/disambig.int \ --batch-size=$batch_size $scale_opts \ diff --git a/egs/wsj/s5/steps/lmrescore.sh b/egs/wsj/s5/steps/lmrescore.sh index 4fa63e613a3..aed341bb8d9 100755 --- a/egs/wsj/s5/steps/lmrescore.sh +++ b/egs/wsj/s5/steps/lmrescore.sh @@ -49,8 +49,8 @@ if ! 
cmp -s $oldlang/words.txt $newlang/words.txt; then echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; fi -oldlmcommand="fstproject --project_output=true $oldlm |" -newlmcommand="fstproject --project_output=true $newlm |" +oldlmcommand="fstproject --project_type=output $oldlm |" +newlmcommand="fstproject --project_type=output $newlm |" mkdir -p $outdir/log @@ -84,10 +84,10 @@ case "$mode" in $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ gunzip -c $indir/lat.JOB.gz \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ - lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \ + lattice-compose ark:- "fstproject --project_type=output $oldlm |" ark:- \| \ lattice-determinize ark:- ark:- \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ - lattice-compose ark:- "fstproject --project_output=true $newlm |" ark:- \| \ + lattice-compose ark:- "fstproject --project_type=output $newlm |" ark:- \| \ lattice-determinize ark:- ark:- \| \ gzip -c \>$outdir/lat.JOB.gz || exit 1; ;; @@ -98,7 +98,7 @@ case "$mode" in $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ gunzip -c $indir/lat.JOB.gz \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ - lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \ + lattice-compose ark:- "fstproject --project_type=output $oldlm |" ark:- \| \ lattice-determinize ark:- ark:- \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \ lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \ diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa.sh b/egs/wsj/s5/steps/lmrescore_const_arpa.sh index 3106261389e..34ecfc9079b 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa.sh @@ -45,7 +45,7 @@ if ! cmp -s $oldlang/words.txt $newlang/words.txt; then echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; fi -oldlmcommand="fstproject --project_output=true $oldlm |" +oldlmcommand="fstproject --project_type=output $oldlm |" mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh index 7d4b983e761..b97c9f4ec9c 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa_undeterminized.sh @@ -70,7 +70,7 @@ if ! 
cmp -s $oldlang/words.txt $newlang/words.txt; then echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; fi -oldlmcommand="fstproject --project_output=true $oldlm |" +oldlmcommand="fstproject --project_type=output $oldlm |" mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh index 633be09f2bf..f7b17f1342c 100755 --- a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -71,7 +71,7 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; -oldlm_command="fstproject --project_output=true $oldlm |" +oldlm_command="fstproject --project_type=output $oldlm |" mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/decode_lookahead.sh b/egs/wsj/s5/steps/nnet3/decode_lookahead.sh index 47f13dffc07..8c696c64aa7 100755 --- a/egs/wsj/s5/steps/nnet3/decode_lookahead.sh +++ b/egs/wsj/s5/steps/nnet3/decode_lookahead.sh @@ -20,10 +20,6 @@ min_active=200 ivector_scale=1.0 lattice_beam=8.0 # Beam we use in lattice generation. iter=final -use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. - # In that case it is recommended to set num-threads to a large - # number, e.g. 20 if you have that many free CPU slots on a GPU - # node, and to use a small number of jobs. scoring_opts= skip_diagnostics=false skip_scoring=false @@ -52,10 +48,6 @@ if [ $# -ne 3 ]; then echo " --beam # Decoding beam; default 15.0" echo " --iter # Iteration of model to decode; default is final." echo " --scoring-opts # options to local/score.sh" - echo " --num-threads # number of threads to use, default 1." - echo " --use-gpu # default: false. If true, we recommend" - echo " # to use large --num-threads as the graph" - echo " # search becomes the limiting factor." exit 1; fi @@ -80,7 +72,6 @@ done sdata=$data/split$nj; cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; -thread_string= mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; diff --git a/egs/wsj/s5/steps/pytorchnn/lmrescore_lattice_pytorchnn.sh b/egs/wsj/s5/steps/pytorchnn/lmrescore_lattice_pytorchnn.sh index 2e6f5538e86..ccf4fc72cd0 100755 --- a/egs/wsj/s5/steps/pytorchnn/lmrescore_lattice_pytorchnn.sh +++ b/egs/wsj/s5/steps/pytorchnn/lmrescore_lattice_pytorchnn.sh @@ -124,7 +124,7 @@ fi # Rescore the expanded lattice: add neural LM scores first and then remove the # old N-gram LM scores. The two models are effectively interpolated. -oldlm_command="fstproject --project_output=true $oldlm |" +oldlm_command="fstproject --project_type=output $oldlm |" oldlm_weight=$(perl -e "print -1.0 * $weight;") nnlm_weight=$(perl -e "print $weight;") if [ $stage -le 4 ]; then diff --git a/egs/wsj/s5/steps/pytorchnn/lmrescore_nbest_pytorchnn.sh b/egs/wsj/s5/steps/pytorchnn/lmrescore_nbest_pytorchnn.sh index f8f2252537c..842f5c868ff 100755 --- a/egs/wsj/s5/steps/pytorchnn/lmrescore_nbest_pytorchnn.sh +++ b/egs/wsj/s5/steps/pytorchnn/lmrescore_nbest_pytorchnn.sh @@ -128,7 +128,7 @@ if [ "$oldlm" == "$oldlang/G.fst" ]; then # original lattice. 
$cmd JOB=1:$nj $dir/log/remove_old.JOB.log \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 "ark:gunzip -c $dir/nbest1.JOB.gz|" ark:- \| \ - lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \ + lattice-compose ark:- "fstproject --project_type=output $oldlm |" ark:- \| \ lattice-1best ark:- ark:- \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- "ark:|gzip -c >$dir/nbest2.JOB.gz" \ || exit 1; diff --git a/egs/wsj/s5/steps/rnnlmrescore.sh b/egs/wsj/s5/steps/rnnlmrescore.sh index de6114038b8..8d84d407f7a 100755 --- a/egs/wsj/s5/steps/rnnlmrescore.sh +++ b/egs/wsj/s5/steps/rnnlmrescore.sh @@ -127,7 +127,7 @@ if [ "$oldlm" == "$oldlang/G.fst" ]; then # original lattice. $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 "ark:gunzip -c $dir/nbest1.JOB.gz|" ark:- \| \ - lattice-compose ark:- "fstproject --project_output=true $oldlm |" ark:- \| \ + lattice-compose ark:- "fstproject --project_type=output $oldlm |" ark:- \| \ lattice-1best ark:- ark:- \| \ lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- "ark:|gzip -c >$dir/nbest2.JOB.gz" \ || exit 1; diff --git a/egs/wsj/s5/steps/tfrnnlm/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/tfrnnlm/lmrescore_rnnlm_lat.sh index 437549f339f..21372b3cb89 100644 --- a/egs/wsj/s5/steps/tfrnnlm/lmrescore_rnnlm_lat.sh +++ b/egs/wsj/s5/steps/tfrnnlm/lmrescore_rnnlm_lat.sh @@ -65,7 +65,7 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; -oldlm_command="fstproject --project_output=true $oldlm |" +oldlm_command="fstproject --project_type=output $oldlm |" mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 051715f2b1e..ed4710d0b1f 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -54,7 +54,7 @@ function check_sorted { } for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames $utt_extra_files $spk_extra_files; do + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x diff --git a/egs/wsj/s5/utils/lang/make_unk_lm.sh b/egs/wsj/s5/utils/lang/make_unk_lm.sh index f3a41e1af4e..1160214faec 100755 --- a/egs/wsj/s5/utils/lang/make_unk_lm.sh +++ b/egs/wsj/s5/utils/lang/make_unk_lm.sh @@ -304,7 +304,7 @@ fstcompile $sym_opts <$dir/unk_fst_orig.txt >$dir/unk_orig.fst # a lot of final-states that have no transitions out of them. fstproject $dir/unk_orig.fst | \ fstcompose - $dir/constraint.fst | \ - fstproject --project_output=true | \ + fstproject --project_type=output | \ fstpushspecial | \ fstminimizeencoded | \ fstrmsymbols --remove-from-output=true <(echo $phone_disambig_int) >$dir/unk.fst diff --git a/egs/wsj/s5/utils/mkgraph_lookahead.sh b/egs/wsj/s5/utils/mkgraph_lookahead.sh index 33280f13a65..a89fcfa414d 100755 --- a/egs/wsj/s5/utils/mkgraph_lookahead.sh +++ b/egs/wsj/s5/utils/mkgraph_lookahead.sh @@ -147,21 +147,21 @@ if [[ -z $arpa ]]; then [ ! 
-f $lang/oov.int ] && \ echo "$0: --remove-oov option: no file $lang/oov.int" && exit 1; fstrmsymbols --remove-arcs=true --apply-to-output=true $lang/oov.int $gr | \ - fstrelabel --relabel_ipairs=${dir}/relabel | \ + fstrelabel --relabel_ipairs=${dir}/relabel --relabel_opairs=${dir}/relabel | \ fstarcsort --sort_type=ilabel | \ fstconvert --fst_type=const > ${dir}/Gr.fst.$$ else - fstrelabel --relabel_ipairs=${dir}/relabel "$gr" | \ + fstrelabel --relabel_ipairs=${dir}/relabel --relabel_opairs=${dir}/relabel "$gr" | \ fstarcsort --sort_type=ilabel | \ fstconvert --fst_type=const > ${dir}/Gr.fst.$$ fi mv $dir/Gr.fst.$$ $dir/Gr.fst - cp $lang/words.txt $dir/ || exit 1; + utils/relabel_words.py ${dir}/relabel ${lang}/words.txt > ${dir}/words.txt fi else if [[ ! -s $dir/Gr.fst || $dir/Gr.fst -ot $arpa ]]; then # Opengrm builds acceptors, so we need to reorder words in symboltable - utils/apply_map.pl --permissive -f 2 ${dir}/relabel < ${lang}/words.txt > ${dir}/words.txt + utils/relabel_words.py ${dir}/relabel ${lang}/words.txt > ${dir}/words.txt gunzip -c $arpa | ngramread --OOV_symbol=`cat ${lang}/oov.txt` --symbols=${dir}/words.txt --ARPA | \ fstarcsort --sort_type=ilabel | \ fstconvert --fst_type=ngram > ${dir}/Gr.fst.$$ diff --git a/egs/wsj/s5/utils/relabel_words.py b/egs/wsj/s5/utils/relabel_words.py new file mode 100755 index 00000000000..cc2048d6bc6 --- /dev/null +++ b/egs/wsj/s5/utils/relabel_words.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# Relabel words for lookahead + +import sys + +lmap = {} +for line in open(sys.argv[1]): + items = line.split() + lmap[items[0]] = items[1] + +for line in open(sys.argv[2]): + line = line.strip() + word, id = line.split() + if word in set(["<eps>", "<s>", "</s>"]): + print (line) + else: + print (word, lmap[id]) diff --git a/egs/wsj/s5/utils/subword/prepare_subword_text.sh b/egs/wsj/s5/utils/subword/prepare_subword_text.sh index 2a5750c9238..aa0163235a6 100755 --- a/egs/wsj/s5/utils/subword/prepare_subword_text.sh +++ b/egs/wsj/s5/utils/subword/prepare_subword_text.sh @@ -36,7 +36,7 @@ grep -q $separator $word_text && echo "$0: Error, word text file contains separa glossaries_opt= [ -z $glossaires ] && glossaries_opt="--glossaries $glossaries" cut -d ' ' -f2- $word_text | \ - utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaries_opt > ${word_text}.sub + utils/lang/bpe/apply_bpe.py -c $pair_code --separator $separator $glossaires_opt > ${word_text}.sub if [ $word_text == $subword_text ]; then mv $word_text ${word_text}.old cut -d ' ' -f1 ${word_text}.old | paste -d ' ' - ${word_text}.sub > $subword_text diff --git a/egs/xbmu_amdo31/README.txt b/egs/xbmu_amdo31/README.txt deleted file mode 100644 index d2cda16fa58..00000000000 --- a/egs/xbmu_amdo31/README.txt +++ /dev/null @@ -1,11 +0,0 @@ -About the XBMU-AMDO31 corpus XBMU-AMDO31 is an open-source Amdo Tibetan speech corpus published by Northwest Minzu University. - -XBMU-AMDO31 dataset is a speech recognition corpus of Tibetan Amdo dialect. The open source corpus contains 31 hours of speech data and resources related to build speech recognition systems,including transcribed texts and a Tibetan pronunciation lexicon. (The lexicon is a Tibetan lexicon of the Lhasa dialect, which has been reused for the Amdo dialect because of the uniformity of the Tibetan language) The dataset can be used to train a model for Amdo Tibetan Automatic Speech Recognition (ASR).
- -The database can be downloaded from openslr: -http://www.openslr.org/133/ - -For more details, please visit: -https://huggingface.co/datasets/syzym/xbmu_amdo31 - -This recipe includes some different ASR models trained with XBMU-AMDO31. \ No newline at end of file diff --git a/egs/xbmu_amdo31/s5/RESULTS b/egs/xbmu_amdo31/s5/RESULTS deleted file mode 100644 index e50e43dc4db..00000000000 --- a/egs/xbmu_amdo31/s5/RESULTS +++ /dev/null @@ -1,8 +0,0 @@ -%WER 46.16 [ 15522 / 33628, 380 ins, 2208 del, 12934 sub ] exp/mono/decode_test/wer_10_0.0 -%WER 24.60 [ 8274 / 33628, 330 ins, 860 del, 7084 sub ] exp/tri1/decode_test/wer_13_0.0 -%WER 24.42 [ 8213 / 33628, 323 ins, 847 del, 7043 sub ] exp/tri2/decode_test/wer_13_0.0 -%WER 22.93 [ 7712 / 33628, 336 ins, 814 del, 6562 sub ] exp/tri3a/decode_test/wer_12_0.0 -%WER 20.17 [ 6783 / 33628, 275 ins, 764 del, 5744 sub ] exp/tri4a/decode_test/wer_15_0.0 -%WER 19.03 [ 6400 / 33628, 292 ins, 667 del, 5441 sub ] exp/tri5a/decode_test/wer_14_0.0 -%WER 15.45 [ 5196 / 33628, 229 ins, 646 del, 4321 sub ] exp/nnet3/tdnn_sp/decode_test/wer_16_0.0 -%WER 15.57 [ 5235 / 33628, 244 ins, 575 del, 4416 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_11_0.0 diff --git a/egs/xbmu_amdo31/s5/cmd.sh b/egs/xbmu_amdo31/s5/cmd.sh deleted file mode 100644 index 71dd849a93b..00000000000 --- a/egs/xbmu_amdo31/s5/cmd.sh +++ /dev/null @@ -1,15 +0,0 @@ -# you can change cmd.sh depending on what type of queue you are using. -# If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run -# commands one by one: most recipes will exhaust the memory on your -# machine). queue.pl works with GridEngine (qsub). slurm.pl works -# with slurm. Different queues are configured differently, with different -# queue names and different ways of specifying things like memory; -# to account for these differences you can create and edit the file -# conf/queue.conf to match your queue's configuration. Search for -# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, -# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. - -export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/xbmu_amdo31/s5/conf/decode.config b/egs/xbmu_amdo31/s5/conf/decode.config deleted file mode 100644 index d91f86183af..00000000000 --- a/egs/xbmu_amdo31/s5/conf/decode.config +++ /dev/null @@ -1,5 +0,0 @@ -beam=11.0 # beam for decoding. Was 13.0 in the scripts. -first_beam=8.0 # beam for 1st-pass decoding in SAT. - - - diff --git a/egs/xbmu_amdo31/s5/conf/mfcc.conf b/egs/xbmu_amdo31/s5/conf/mfcc.conf deleted file mode 100644 index a1aa3d6c158..00000000000 --- a/egs/xbmu_amdo31/s5/conf/mfcc.conf +++ /dev/null @@ -1,2 +0,0 @@ ---use-energy=false # only non-default option. ---sample-frequency=16000 diff --git a/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf b/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf deleted file mode 100644 index ca067e77b37..00000000000 --- a/egs/xbmu_amdo31/s5/conf/mfcc_hires.conf +++ /dev/null @@ -1,10 +0,0 @@ -# config for high-resolution MFCC features, intended for neural network training. -# Note: we keep all cepstra, so it has the same info as filterbank features, -# but MFCC is more easily compressible (because less correlated) which is why -# we prefer this method. ---use-energy=false # use average of log energy, not energy. 
---sample-frequency=16000 # XBMU-AMDO31 audio is sampled at 16kHz ---num-mel-bins=40 # similar to Google's setup. ---num-ceps=40 # there is no dimensionality reduction. ---low-freq=40 # low cutoff frequency for mel bins ---high-freq=-200 # high cutoff frequency, relative to the Nyquist of 8000 (=7800) diff --git a/egs/xbmu_amdo31/s5/conf/online_cmvn.conf b/egs/xbmu_amdo31/s5/conf/online_cmvn.conf deleted file mode 100644 index 591367e7ae9..00000000000 --- a/egs/xbmu_amdo31/s5/conf/online_cmvn.conf +++ /dev/null @@ -1 +0,0 @@ -# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster. diff --git a/egs/xbmu_amdo31/s5/conf/online_pitch.conf b/egs/xbmu_amdo31/s5/conf/online_pitch.conf deleted file mode 100644 index c0f1342160d..00000000000 --- a/egs/xbmu_amdo31/s5/conf/online_pitch.conf +++ /dev/null @@ -1,4 +0,0 @@ ---sample-frequency=16000 ---simulate-first-pass-online=true ---normalization-right-context=25 ---frames-per-chunk=10 diff --git a/egs/xbmu_amdo31/s5/conf/pitch.conf b/egs/xbmu_amdo31/s5/conf/pitch.conf deleted file mode 100644 index e959a19d5b8..00000000000 --- a/egs/xbmu_amdo31/s5/conf/pitch.conf +++ /dev/null @@ -1 +0,0 @@ ---sample-frequency=16000 diff --git a/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh b/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh deleted file mode 120000 index 34499362831..00000000000 --- a/egs/xbmu_amdo31/s5/local/chain/run_tdnn.sh +++ /dev/null @@ -1 +0,0 @@ -tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh deleted file mode 100755 index 826aa163f2a..00000000000 --- a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_1a.sh +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env bash - -# This script is based on run_tdnn_7h.sh in the swbd chain recipe. - -set -e - -# configs for 'chain' -affix= -stage=0 -train_stage=-10 -get_egs_stage=-10 -dir=exp/chain/tdnn_1a # Note: _sp will get added to this -decode_iter= - -# training options -num_epochs=4 -initial_effective_lrate=0.001 -final_effective_lrate=0.0001 -max_param_change=2.0 -final_layer_normalize_target=0.5 -num_jobs_initial=1 -num_jobs_final=2 -minibatch_size=128 -frames_per_eg=150,110,90 -remove_egs=true -common_egs_dir= -xent_regularize=0.1 - -# End configuration section. -echo "$0 $*" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 9 ]; then - # Build a tree using our new topology. This is the critically different - # step compared with other recipes.
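# (Aside, not part of the original recipe: a minimal sanity check of the tree
# built below, assuming the $treedir and $lang variables defined earlier in
# this script, might look like:
#   tree-info $treedir/tree | grep -E 'num-pdfs|context-width|central-position'
#   head $lang/topo   # the special 'chain' topology, traversable in one frame
# )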
- steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir -fi - -if [ $stage -le 10 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=43 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=625 - relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 - relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 - relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 - relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 - relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - -if [ $stage -le 11 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.stage $get_egs_stage \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_eg \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs $remove_egs \ - --feat-dir data/${train_set}_hires \ - --tree-dir $treedir \ - --lat-dir exp/tri5a_sp_lats \ - --dir $dir || exit 1; -fi - -if [ $stage -le 12 ]; then - # Note: it might appear that this $lang directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph -fi - -graph_dir=$dir/graph -if [ $stage -le 13 ]; then - for test_set in dev test; do - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 5 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3/ivectors_$test_set \ - $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; - done -fi - -exit; diff --git a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh deleted file mode 100755 index 52d56adbc60..00000000000 --- a/egs/xbmu_amdo31/s5/local/chain/tuning/run_tdnn_2a.sh +++ /dev/null @@ -1,211 +0,0 @@ -#!/usr/bin/env bash - -# This script is based on run_tdnn_1a.sh. -# This setup used online pitch to train the neural network. -# It requires a online_pitch.conf in the conf dir. - -set -e - -# configs for 'chain' -affix= -stage=0 -train_stage=-10 -get_egs_stage=-10 -dir=exp/chain/tdnn_2a # Note: _sp will get added to this -decode_iter= - -# training options -num_epochs=4 -initial_effective_lrate=0.001 -final_effective_lrate=0.0001 -max_param_change=2.0 -final_layer_normalize_target=0.5 -num_jobs_initial=2 -num_jobs_final=12 -minibatch_size=128 -frames_per_eg=150,110,90 -remove_egs=true -common_egs_dir= -xent_regularize=0.1 - -# End configuration section. -echo "$0 $*" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 9 ]; then - # Build a tree using our new topology. This is the critically different - # step compared with other recipes. 
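# (Aside, not part of the original recipe: "--context-width=2
# --central-position=1" below requests a left-biphone tree, i.e. each pdf
# depends only on the current phone and its left neighbour, which is the usual
# context for 'chain' models; a conventional full-triphone tree would instead
# use "--context-width=3 --central-position=1".)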
- steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir -fi - -if [ $stage -le 10 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=43 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=625 - relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 - relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 - relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 - relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 - relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - -if [ $stage -le 11 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.stage $get_egs_stage \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_eg \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs $remove_egs \ - --feat-dir data/${train_set}_hires_online \ - --tree-dir $treedir \ - --lat-dir exp/tri5a_sp_lats \ - --dir $dir || exit 1; -fi - -if [ $stage -le 12 ]; then - # Note: it might appear that this $lang directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph -fi - -graph_dir=$dir/graph -if [ $stage -le 13 ]; then - for test_set in dev test; do - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 10 --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3/ivectors_$test_set \ - $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; - done -fi - -if [ $stage -le 14 ]; then - steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ - --add-pitch true \ - $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; -fi - -dir=${dir}_online -if [ $stage -le 15 ]; then - for test_set in dev test; do - steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 10 --cmd "$decode_cmd" \ - --config conf/decode.config \ - $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; - done -fi - -if [ $stage -le 16 ]; then - for test_set in dev test; do - steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 10 --cmd "$decode_cmd" --per-utt true \ - --config conf/decode.config \ - $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1; - done -fi - -exit; diff --git a/egs/xbmu_amdo31/s5/local/download_and_untar.sh b/egs/xbmu_amdo31/s5/local/download_and_untar.sh deleted file mode 100755 index 9c70836bf46..00000000000 --- a/egs/xbmu_amdo31/s5/local/download_and_untar.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2017 Xingyu Na -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>" - echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo "<corpus-part> can be one of: data_aishell, resource_aishell." - exit 1;
-fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data" - exit 1; -fi - -part_ok=false -list="data_aishell resource_aishell" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected <corpus-part> to be one of $list, but got '$part'" - exit 1; -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - -# sizes of the archive files in bytes. -sizes="15582913665 1246920" - -if [ -f $data/$part.tgz ]; then - size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tgz - else - echo "$data/$part.tgz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tgz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tgz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tgz; then - echo "$0: error un-tarring archive $data/$part.tgz" - exit 1; -fi - -touch $data/$part/.complete - -if [ $part == "data_aishell" ]; then - cd $data/$part/wav - for wav in ./*.tar.gz; do - echo "Extracting wav from $wav" - tar -zxf $wav && rm $wav - done -fi - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - -if $remove_archive; then - echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." - rm $data/$part.tgz -fi - -exit 0; diff --git a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh b/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh deleted file mode 100755 index 610774fb2a2..00000000000 --- a/egs/xbmu_amdo31/s5/local/nnet3/run_ivector_common.sh +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -# This script is modified from mini_librispeech/s5/local/nnet3/run_ivector_common.sh - -# This script is called from local/nnet3/run_tdnn.sh and -# local/chain/run_tdnn.sh (and may eventually be called by more -# scripts). It contains the common feature preparation and -# iVector-related parts of the script. See those scripts for examples -# of usage. - -stage=0 -train_set=train -test_sets="dev test" -gmm=tri5a -online=false -nnet3_affix= - -. ./cmd.sh -. ./path.sh -. utils/parse_options.sh - -gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_sp_ali - -for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do - if [ !
-f $f ]; then - echo "$0: expected file $f to exist" - exit 1 - fi -done - -online_affix= -if [ $online = true ]; then - online_affix=_online -fi - -if [ $stage -le 1 ]; then - # Although the nnet will be trained on high-resolution data, we still have to - # perturb the normal data to get the alignments; _sp stands for speed-perturbed - echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" - utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp - echo "$0: making MFCC features for low-resolution speed-perturbed data" - steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ - exp/make_mfcc/train_sp mfcc_perturbed || exit 1; - steps/compute_cmvn_stats.sh data/${train_set}_sp \ - exp/make_mfcc/train_sp mfcc_perturbed || exit 1; - utils/fix_data_dir.sh data/${train_set}_sp -fi - -if [ $stage -le 2 ]; then - echo "$0: aligning with the perturbed low-resolution data" - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 -fi - -if [ $stage -le 3 ]; then - # Create high-resolution MFCC features (with 40 cepstra instead of 13). - # This shows how you can split across multiple file-systems. - echo "$0: creating high-resolution MFCC features" - mfccdir=mfcc_perturbed_hires$online_affix - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/xbmu_amdo-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage - fi - - for datadir in ${train_set}_sp ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix - done - - # do volume-perturbation on the training data prior to extracting hires - # features; this helps make trained nnets more invariant to test data volume. - utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1; - - for datadir in ${train_set}_sp ${test_sets}; do - steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; - utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1; - # create MFCC data dir without pitch to extract iVector - utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; - done -fi - -if [ $stage -le 4 ]; then - echo "$0: computing a subset of data to train the diagonal UBM." - # We'll use about a quarter of the data.
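# (Aside, not part of the original script: this stage follows the
# mini_librispeech pattern named in the header above, where the quarter-subset
# is computed roughly as in the sketch below; the exact data-dir suffix and
# destination name are assumptions based on that parent recipe.)
#   num_utts_total=$(wc -l <data/${train_set}_sp_hires_nopitch/utt2spk)
#   num_utts=$[$num_utts_total/4]
#   utils/data/subset_data_dir.sh data/${train_set}_sp_hires_nopitch \
#       $num_utts ${temp_data_root}/${train_set}_subset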
- mkdir -p exp/nnet3${nnet3_affix}/diag_ubm - temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm - - num_utts_total=$(wc -l $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=43 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=850 - relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) - relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) - relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn6 dim=850 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - -if [ $stage -le 8 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/train_dnn.py --stage=$train_stage \ - --cmd="$decode_cmd" \ - --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --egs.dir "$common_egs_dir" \ - --cleanup.remove-egs $remove_egs \ - --cleanup.preserve-model-interval 500 \ - --use-gpu true \ - --feat-dir=data/${train_set}_hires \ - --ali-dir $ali_dir \ - --lang data/lang \ - --reporting.email="$reporting_email" \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 9 ]; then - # this version of the decoding treats each utterance separately - # without carrying forward speaker information. - for decode_set in dev test; do - num_jobs=$(cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l) - decode_dir=${dir}/decode_$decode_set - steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ - $graph_dir data/${decode_set}_hires $decode_dir || exit 1; - done -fi - -wait; -exit 0; diff --git a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh b/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh deleted file mode 100755 index 3f920315b77..00000000000 --- a/egs/xbmu_amdo31/s5/local/nnet3/tuning/run_tdnn_2a.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env bash - -# This script is based on aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh - -# In this script, the neural network is trained on hires MFCC and online pitch features. -# The online pitch setup requires an online_pitch.conf in the conf dir for both training -# and testing. - -set -e - -stage=0 -train_stage=-10 -affix= -common_egs_dir= - -# training options -initial_effective_lrate=0.0015 -final_effective_lrate=0.00015 -num_epochs=4 -num_jobs_initial=2 -num_jobs_final=12 -remove_egs=true - -# feature options -use_ivectors=true - -# End configuration section. - -. ./cmd.sh -. ./path.sh -.
./utils/parse_options.sh - -if ! cuda-compiled; then - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=43 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=850 - relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) - relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) - relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn6 dim=850 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - -if [ $stage -le 8 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/train_dnn.py --stage=$train_stage \ - --cmd="$decode_cmd" \ - --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --egs.dir "$common_egs_dir" \ - --cleanup.remove-egs $remove_egs \ - --cleanup.preserve-model-interval 500 \ - --use-gpu true \ - --feat-dir=data/${train_set}_hires_online \ - --ali-dir $ali_dir \ - --lang data/lang \ - --reporting.email="$reporting_email" \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 9 ]; then - # this version of the decoding treats each utterance separately - # without carrying forward speaker information. - for decode_set in dev test; do - num_jobs=$(cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l) - decode_dir=${dir}/decode_$decode_set - steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ - $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; - done -fi - -if [ $stage -le 10 ]; then - steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ - --add-pitch true \ - data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; -fi - -if [ $stage -le 11 ]; then - # do the actual online decoding with iVectors, carrying info forward from - # previous utterances of the same speaker. - for decode_set in dev test; do - # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` - num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l) - decode_dir=${dir}_online/decode_$decode_set - steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ - --config conf/decode.config \ - $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; - done -fi - -if [ $stage -le 12 ]; then - # this version of the decoding treats each utterance separately - # without carrying forward speaker information. 
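# (Aside, not part of the original script: the loop below differs from the
# previous decoding stage only in passing "--per-utt true" to
# steps/online/nnet3/decode.sh, which resets the online adaptation state for
# every utterance instead of carrying i-vector statistics across utterances of
# the same speaker; comparing decode_${test_set} with decode_${test_set}_per_utt
# measures how much the speaker-level adaptation is worth.)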
- for decode_set in dev test; do - # num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` - num_jobs=$(< "data/${decode_set}_hires_online/utt2spk" cut -d' ' -f2 | sort -u | wc -l) - decode_dir=${dir}_online/decode_${decode_set}_per_utt - steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ - --config conf/decode.config --per-utt true \ - $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; - done -fi - -wait; -exit 0; diff --git a/egs/xbmu_amdo31/s5/local/score.sh b/egs/xbmu_amdo31/s5/local/score.sh deleted file mode 100755 index d283ceb68dc..00000000000 --- a/egs/xbmu_amdo31/s5/local/score.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -set -e -o pipefail -set -x -steps/score_kaldi.sh "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 "$@" - -echo "$0: Done" diff --git a/egs/xbmu_amdo31/s5/local/wer_hyp_filter b/egs/xbmu_amdo31/s5/local/wer_hyp_filter deleted file mode 100755 index c6660e4efe1..00000000000 --- a/egs/xbmu_amdo31/s5/local/wer_hyp_filter +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env perl - -@filters=('',''); - -foreach $w (@filters) { - $bad{$w} = 1; -} - -while(<STDIN>) { - @A = split(" ", $_); - $id = shift @A; - print "$id "; - foreach $a (@A) { - if (!defined $bad{$a}) { - print "$a "; - } - } - print "\n"; -} diff --git a/egs/xbmu_amdo31/s5/local/wer_output_filter b/egs/xbmu_amdo31/s5/local/wer_output_filter deleted file mode 100755 index aceeeec41b4..00000000000 --- a/egs/xbmu_amdo31/s5/local/wer_output_filter +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) -# Apache 2.0 -use utf8; - -use open qw(:encoding(utf8)); -binmode STDIN, ":utf8"; -binmode STDOUT, ":utf8"; -binmode STDERR, ":utf8"; - -while (<>) { - @F = split " "; - print $F[0] . " "; - foreach $s (@F[1..$#F]) { - if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) { - print ""; - } else { - print "$s" - } - print " "; - } - print "\n"; -} - - diff --git a/egs/xbmu_amdo31/s5/local/wer_ref_filter b/egs/xbmu_amdo31/s5/local/wer_ref_filter deleted file mode 100755 index c6660e4efe1..00000000000 --- a/egs/xbmu_amdo31/s5/local/wer_ref_filter +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env perl - -@filters=('',''); - -foreach $w (@filters) { - $bad{$w} = 1; -} - -while(<STDIN>) { - @A = split(" ", $_); - $id = shift @A; - print "$id "; - foreach $a (@A) { - if (!defined $bad{$a}) { - print "$a "; - } - } - print "\n"; -} diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh deleted file mode 100755 index a3ba6fabaf4..00000000000 --- a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_data_prep.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2017 Xingyu Na -# 2021 Northwest Minzu University (Senyan Li) -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 <audio-path> <text-path>" - echo " $0 /export/data/xbmu_amdo31/data/wav /export/data/xbmu_amdo31/data/transcript" - exit 1; -fi - -tibetan_audio_dir=$1 -tibetan_text=$2/transcript_clean.txt - -train_dir=data/local/train -dev_dir=data/local/dev -test_dir=data/local/test -tmp_dir=data/local/tmp - -mkdir -p $train_dir -mkdir -p $dev_dir -mkdir -p $test_dir -mkdir -p $tmp_dir - -# data directory check -if [ ! -d $tibetan_audio_dir ] || [ ! -f $tibetan_text ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi -echo $tibetan_audio_dir -# find wav audio file for train, dev and test resp.
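# (Aside, not part of the original script: the greps and the utt2spk
# derivation below imply the corpus is expected to be laid out roughly as
#   $tibetan_audio_dir/{train,dev,test}/<speaker>/<utterance>.wav
# since the speaker id is taken from the second-to-last path component; this
# layout is inferred from the code, not documented in the script itself.)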
-find $tibetan_audio_dir -iname "*.wav" > $tmp_dir/wav.flist -n=$(wc -l < "$tmp_dir/wav.flist") -[ $n -ne 22630 ] && \ - echo Warning: expected 22630 data files, found $n - -grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; -grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; -grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; - -rm -r $tmp_dir -# Transcriptions preparation -# cat $tibetan_text |head -10 -for dir in $train_dir $dev_dir $test_dir; do - echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}'> $dir/utt2spk_all - rm -f $dir/transcripts1.txt - while read -r line - do - line1=$(echo "$line" | cut -d '-' -f 2) - line2=$(grep -w $line1 $tibetan_text |cut -d " " -f 2-) - text=$line" "$line2 - echo $text >>$dir/transcripts1.txt - done < "$dir/utt.list" - paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all - utils/filter_scp.pl -f 1 $dir/utt.list $dir/transcripts1.txt > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list - utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk - utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp - sort -u $dir/transcripts.txt > $dir/text - utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt -done - -mkdir -p data/train data/dev data/test - -for f in spk2utt utt2spk wav.scp text; do - cp $train_dir/$f data/train/$f || exit 1; - cp $dev_dir/$f data/dev/$f || exit 1; - cp $test_dir/$f data/test/$f || exit 1; -done - -echo "$0: Tibetan data preparation succeeded" -exit 0; diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh deleted file mode 100755 index 1e5537858ff..00000000000 --- a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_prepare_dict.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2017 Xingyu Na -# Apache 2.0 - -# prepare dict resources - -. ./path.sh - -[ $# != 1 ] && echo "Usage: $0 <resource-path>" && exit 1; - -res_dir=$1 -dict_dir=data/local/dict -mkdir -p $dict_dir -cp $res_dir/lexicon.txt $dict_dir - -cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ - perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil"); - m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; } - foreach $l (values %q) {print "$l\n";} - ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1; - -echo sil > $dict_dir/silence_phones.txt - -echo sil > $dict_dir/optional_silence.txt - -# No "extra questions" in the input to this setup, as we don't -# have stress or tone - -cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; -cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { - $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ - >> $dict_dir/extra_questions.txt || exit 1; - -echo "$0: Tibetan dict preparation succeeded" -exit 0; diff --git a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh b/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh deleted file mode 100755 index 658f0e7bc15..00000000000 --- a/egs/xbmu_amdo31/s5/local/xbmu_amdo31_train_lms.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env bash - - -# To be run from one directory above this script. -.
./path.sh - -text=data/local/train/text -lexicon=data/local/dict/lexicon.txt - -for f in "$text" "$lexicon"; do - [ ! -f $f ] && echo "$0: No such file $f" && exit 1; -done - -# This script takes no arguments. It assumes you have already run -# xbmu_amdo31_data_prep.sh. -# It takes as input the files -# data/local/train/text -# data/local/dict/lexicon.txt -dir=data/local/lm -mkdir -p $dir - -kaldi_lm=$(command -v train_lm.sh) -if [ -z $kaldi_lm ]; then - echo "$0: train_lm.sh is not found. That might mean it's not installed" - echo "$0: or it is not added to PATH" - echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it" - exit 1 -fi - -cleantext=$dir/text.no_oov - -cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \ - > $cleantext || exit 1; - -cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). -cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of <unk> as there aren't any OOVs -cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<unk>" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. -cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { printf "\n"; }}}' | gzip -c >$dir/train.gz \ - || exit 1; - -train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# LM is small enough that we don't need to prune it (only about 0.7M N-grams). -# Perplexity over 128254.000000 words is 90.446690 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - -exit 0 - - -# From here is some commands to do a baseline with SRILM (assuming -# you have it installed). -heldout_sent=10000 # Don't change this if you want result to be comparable with - # kaldi_lm results -sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. -mkdir -p $sdir -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \ - head -$heldout_sent > $sdir/heldout -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \ - tail -n +$heldout_sent > $sdir/train - -cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist - - -ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ - -map-unk "<unk>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz -ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout -# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482 - -# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above. -# Difference in WSJ must have been due to different treatment of <unk>. -ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout -# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379 diff --git a/egs/xbmu_amdo31/s5/path.sh b/egs/xbmu_amdo31/s5/path.sh deleted file mode 100755 index b70ffbfbb26..00000000000 --- a/egs/xbmu_amdo31/s5/path.sh +++ /dev/null @@ -1,6 +0,0 @@ -export KALDI_ROOT=$(pwd)/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH -[ !
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 -. $KALDI_ROOT/tools/config/common_path.sh -export LC_ALL=C diff --git a/egs/xbmu_amdo31/s5/run.sh b/egs/xbmu_amdo31/s5/run.sh deleted file mode 100755 index 61b3e8f62d8..00000000000 --- a/egs/xbmu_amdo31/s5/run.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2021 Northwest Minzu University (Authors: Senyan Li) - # 2017 Hui Bu - # 2017 Jiayu Du - # 2017 Xingyu Na - # 2017 Bengu Wu - # 2017 Hao Zheng -# Apache 2.0 - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. -# Caution: some of the graph creation steps use quite a bit of memory, so you -# should run this on a machine that has sufficient memory. - -# corpus directory and download URL -data=/home1/lsy/kaldi/egs/xbmu_amdo31/s5/export/data -data_url=www.openslr.org/resources/133 - -. ./cmd.sh - -#local/download_and_untar.sh $data $data_url xbmu-amdo31 || exit 1; - -# Lexicon Preparation, -local/xbmu_amdo31_prepare_dict.sh $data/xbmu_amdo31/resource || exit 1; - -# Data Preparation, -local/xbmu_amdo31_data_prep.sh $data/xbmu_amdo31/data/wav $data/xbmu_amdo31/data/transcript || exit 1; - -# Phone Sets, questions, L compilation -utils/prepare_lang.sh --position-dependent-phones false data/local/dict \ - "<SPOKEN_NOISE>" data/local/lang data/lang || exit 1; - -# LM training -local/xbmu_amdo31_train_lms.sh || exit 1; - -# G compilation, check LG composition -utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \ - data/local/dict/lexicon.txt data/lang_test || exit 1; - -# Now make MFCC plus pitch features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. -mfccdir=mfcc -for x in train dev test; do - steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; - utils/fix_data_dir.sh data/$x || exit 1; -done - -# Train a monophone model on delta features. -steps/train_mono.sh --cmd "$train_cmd" --nj 10 \ - data/train data/lang exp/mono || exit 1; - -# Decode with the monophone model. -utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; -steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ - exp/mono/graph data/dev exp/mono/decode_dev -steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ - exp/mono/graph data/test exp/mono/decode_test - -# Get alignments from monophone system. -steps/align_si.sh --cmd "$train_cmd" --nj 10 \ - data/train data/lang exp/mono exp/mono_ali || exit 1; - -# Train the first triphone pass model tri1 on delta + delta-delta features.
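# (Aside, not part of the original script: in steps/train_deltas.sh the two
# leading numeric arguments below are <num-leaves> and <tot-gauss>, i.e. the
# target number of tied triphone states (2500) and the total Gaussian count
# (20000); the later tri4a/tri5a stages grow these budgets as the models get
# stronger.)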
-steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; - -# decode tri1 -utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; -steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ - exp/tri1/graph data/dev exp/tri1/decode_dev -steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ - exp/tri1/graph data/test exp/tri1/decode_test - -# align tri1 -steps/align_si.sh --cmd "$train_cmd" --nj 10 \ - data/train data/lang exp/tri1 exp/tri1_ali || exit 1; - -# train tri2 [delta+delta-deltas] -steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; - -# decode tri2 -utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph -steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ - exp/tri2/graph data/dev exp/tri2/decode_dev -steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 5 \ - exp/tri2/graph data/test exp/tri2/decode_test - -# Align training data with the tri2 model. -steps/align_si.sh --cmd "$train_cmd" --nj 10 \ - data/train data/lang exp/tri2 exp/tri2_ali || exit 1; - -# Train the second triphone pass model tri3a on LDA+MLLT features. -steps/train_lda_mllt.sh --cmd "$train_cmd" \ - 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; - -# Run a test decode with the tri3a model. -utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; -steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ - exp/tri3a/graph data/dev exp/tri3a/decode_dev -steps/decode.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ - exp/tri3a/graph data/test exp/tri3a/decode_test - -# align tri3a with fMLLR - -steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ - data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; - -# Train the third triphone pass model tri4a on LDA+MLLT+SAT features. -# From now on, we start building a more serious system with Speaker -# Adaptive Training (SAT). -steps/train_sat.sh --cmd "$train_cmd" \ - 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; - -# decode tri4a -utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph -steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ - exp/tri4a/graph data/dev exp/tri4a/decode_dev -steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ - exp/tri4a/graph data/test exp/tri4a/decode_test - -# align tri4a with fMLLR -steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ - data/train data/lang exp/tri4a exp/tri4a_ali - -# Train tri5a, which is LDA+MLLT+SAT -# Building a larger SAT system. 
You can see the num-leaves is 3500 and tot-gauss is 100000 - -steps/train_sat.sh --cmd "$train_cmd" \ - 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; - -# decode tri5a -utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; -steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev || exit 1; -steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 5 --config conf/decode.config \ - exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; - -# align tri5a with fMLLR -steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ - data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; - -# nnet3 -local/nnet3/run_tdnn.sh - -# chain -local/chain/run_tdnn.sh - -# getting results (see RESULTS file) -for x in exp/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null -for x in exp/*/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null - -exit 0; diff --git a/egs/xbmu_amdo31/s5/steps b/egs/xbmu_amdo31/s5/steps deleted file mode 120000 index 6e99bf5b5ad..00000000000 --- a/egs/xbmu_amdo31/s5/steps +++ /dev/null @@ -1 +0,0 @@ -../../wsj/s5/steps \ No newline at end of file diff --git a/egs/xbmu_amdo31/s5/utils b/egs/xbmu_amdo31/s5/utils deleted file mode 120000 index b240885218f..00000000000 --- a/egs/xbmu_amdo31/s5/utils +++ /dev/null @@ -1 +0,0 @@ -../../wsj/s5/utils \ No newline at end of file diff --git a/src/Makefile b/src/Makefile index 5036d12b707..4d4efbc0172 100644 --- a/src/Makefile +++ b/src/Makefile @@ -34,12 +34,6 @@ SUBDIRS += $(CUDADECODER) endif endif -ifeq ($(ROCM), true) -ifeq ($(WITH_CUDADECODER), true) -SUBDIRS += $(CUDADECODER) -endif -endif - SUBDIRS_LIB = $(filter-out %bin, $(SUBDIRS)) SUBDIRS_BIN = $(filter %bin, $(SUBDIRS)) @@ -62,16 +56,14 @@ endif # Don't call rm -rf. rmlibdir: -ifeq ($(KALDI_FLAVOR), dynamic) ifneq ($(KALDILIBDIR), ) - -rm -f $(KALDILIBDIR)/*{.so,.a,.o,.dylib} + -rm -f $(KALDILIBDIR)/*{.so,.a,.o} -rmdir 2>/dev/null $(KALDILIBDIR); true else # KALDILIBDIR might have been unset because of reconfigure. Do a best guess. @echo "Something seems wrong. Please re-run configure." @echo "I will continue but the cleanup might not be complete." endif -endif kaldi.mk: @echo "ERROR: kaldi.mk does not exist; run ./configure first."; diff --git a/src/base/kaldi-error-test.cc b/src/base/kaldi-error-test.cc index 68ef224b5f5..31440edf3f9 100644 --- a/src/base/kaldi-error-test.cc +++ b/src/base/kaldi-error-test.cc @@ -76,7 +76,7 @@ int main() { kaldi::UnitTestError(); KALDI_ASSERT(0); // should not happen. exit(1); - } catch (const kaldi::KaldiFatalError &e) { + } catch (kaldi::KaldiFatalError &e) { std::cout << "The error we generated was: '" << e.KaldiMessage() << "'\n"; } } diff --git a/src/base/kaldi-error.h b/src/base/kaldi-error.h index 572cbb4effd..a9904a752cd 100644 --- a/src/base/kaldi-error.h +++ b/src/base/kaldi-error.h @@ -185,12 +185,12 @@ class MessageLogger { #define KALDI_ASSERT(cond) \ do { \ if (cond) \ - (void)(cond); \ + (void)0; \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_ASSERT(cond) (void)(cond) +#define KALDI_ASSERT(cond) (void)0 #endif // Some more expensive asserts only checked if this defined. 
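// (Aside, not part of the patch: in the disabled branch "(void)(cond)" still
// evaluates the condition, keeping any side effects and silencing
// unused-variable warnings, while "(void)0" compiles to nothing. The
// do { ... } while (0) wrapper is what makes the macro behave as a single
// statement, so in a sketch like the one below the else still binds to the
// outer if correctly:
//
//   if (mat != NULL)
//     KALDI_ASSERT(mat->NumRows() > 0);
//   else
//     KALDI_LOG << "null matrix";
// )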
@@ -198,12 +198,12 @@ class MessageLogger { #define KALDI_PARANOID_ASSERT(cond) \ do { \ if (cond) \ - (void)(cond); \ + (void)0; \ else \ ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); \ } while (0) #else -#define KALDI_PARANOID_ASSERT(cond) (void)(cond) +#define KALDI_PARANOID_ASSERT(cond) (void)0 #endif /***** THIRD-PARTY LOG-HANDLER *****/ diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h index 6d96ecf2b75..7ebf4f85386 100644 --- a/src/base/kaldi-types.h +++ b/src/base/kaldi-types.h @@ -39,21 +39,9 @@ typedef float BaseFloat; // we find in the future lacks stdint.h #include <stdint.h> -#if OPENFST_VER >= 10800 -typedef int8_t int8; -typedef int16_t int16; -typedef int32_t int32; -typedef int64_t int64; - -typedef uint8_t uint8; -typedef uint16_t uint16; -typedef uint32_t uint32; -typedef uint64_t uint64; -typedef float float32; -typedef double double64; -#else +// for discussion on what to do if you need to compile Kaldi +// without OpenFST, see the bottom of this file #include <fst/types.h> -#endif namespace kaldi { using ::int16; @@ -65,4 +53,23 @@ namespace kaldi { typedef float float32; typedef double double64; } // end namespace kaldi + +// In the theoretical case that you decide to compile Kaldi without OpenFST, +// comment out the previous namespace statement and uncomment the following +/* +namespace kaldi { + typedef int8_t int8; + typedef int16_t int16; + typedef int32_t int32; + typedef int64_t int64; + + typedef uint8_t uint8; + typedef uint16_t uint16; + typedef uint32_t uint32; + typedef uint64_t uint64; + typedef float float32; + typedef double double64; +} // end namespace kaldi +*/ + #endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/src/bin/compute-gop.cc b/src/bin/compute-gop.cc index 08847579f85..a6db0fc0c9e 100644 --- a/src/bin/compute-gop.cc +++ b/src/bin/compute-gop.cc @@ -1,7 +1,6 @@ // bin/compute-gop.cc // Copyright 2019 Junbo Zhang -// 2024 Jiun-Ting Li (National Taiwan Normal University) // See ../../COPYING for clarification regarding multiple authors // @@ -108,14 +107,11 @@ int main(int argc, char *argv[]) { const char *usage = "Compute Goodness Of Pronunciation (GOP) from a matrix of " "probabilities (e.g.
from nnet3-compute).\n" - "Usage: compute-gop [options] <model> <transition-alignments-rspecifier> " - "<phoneme-alignments-rspecifier> <prob-matrix-rspecifier> " - "<gop-wspecifier> <phone-feature-wspecifier>\n" + "Usage: compute-gop [options] <model> <alignments-rspecifier> " "<prob-matrix-rspecifier> <gop-wspecifier> " - + "[<phone-feature-wspecifier>]\n" "e.g.:\n" - " nnet3-compute [args] | compute-gop 1.mdl ark:ali.1 ark:ali-phone.1 " - " ark:output.1.ark " + " nnet3-compute [args] | compute-gop 1.mdl ark:ali-phone.1 ark:-" " ark:gop.1 ark:phone-feat.1\n"; ParseOptions po(usage); @@ -134,17 +130,16 @@ int main(int argc, char *argv[]) { po.Read(argc, argv); - if (po.NumArgs() != 6) { + if (po.NumArgs() != 4 && po.NumArgs() != 5) { po.PrintUsage(); exit(1); } std::string model_filename = po.GetArg(1), - transition_alignments_rspecifier = po.GetArg(2), - phoneme_alignments_rspecifier = po.GetArg(3), - prob_rspecifier = po.GetArg(4), - gop_wspecifier = po.GetArg(5), - feat_wspecifier = po.GetArg(6); + alignments_rspecifier = po.GetArg(2), + prob_rspecifier = po.GetArg(3), + gop_wspecifier = po.GetArg(4), + feat_wspecifier = po.GetArg(5); TransitionModel trans_model; { @@ -179,8 +174,7 @@ int main(int argc, char *argv[]) { } } - RandomAccessInt32VectorReader phoneme_alignments_reader(phoneme_alignments_rspecifier); - RandomAccessInt32VectorReader transition_alignments_reader(transition_alignments_rspecifier); + RandomAccessInt32VectorReader alignment_reader(alignments_rspecifier); SequentialBaseFloatMatrixReader prob_reader(prob_rspecifier); PosteriorWriter gop_writer(gop_wspecifier); BaseFloatVectorWriter feat_writer(feat_wspecifier); @@ -188,41 +182,25 @@ int main(int argc, char *argv[]) { int32 num_done = 0; for (; !prob_reader.Done(); prob_reader.Next()) { std::string key = prob_reader.Key(); - if (!phoneme_alignments_reader.HasKey(key)) { - KALDI_WARN << "No phoneme alignment for utterance " << key; + if (!alignment_reader.HasKey(key)) { + KALDI_WARN << "No alignment for utterance " << key; continue; } - if (!transition_alignments_reader.HasKey(key)) { - KALDI_WARN << "No transition alignment for utterance " << key; - continue; - } - auto phoneme_alignment = phoneme_alignments_reader.Value(key); - auto transition_alignment = transition_alignments_reader.Value(key); + auto alignment = alignment_reader.Value(key); Matrix<BaseFloat> &probs = prob_reader.Value(); if (log_applied) probs.ApplyExp(); - std::vector<std::vector<int32> > split; - SplitToPhones(trans_model, transition_alignment, &split); - - std::vector<int32> phone_boundary; - for (int32 i = 0; i < split.size(); i++) { - for (int32 j = 0; j < split[i].size(); j++) { - phone_boundary.push_back(i); - } - } - Matrix<BaseFloat> lpps; ComputeLpps(probs, pdf2phones, &lpps); - int32 frame_num = phoneme_alignment.size(); - if (phoneme_alignment.size() != probs.NumRows()) { + int32 frame_num = alignment.size(); + if (alignment.size() != probs.NumRows()) { KALDI_WARN << "The frame numbers of alignment and prob are not equal."; if (frame_num > probs.NumRows()) frame_num = probs.NumRows(); } KALDI_ASSERT(frame_num > 0); - int32 cur_phone_id = phoneme_alignment[0]; - int32 cur_phone_pos = phone_boundary[0]; + int32 cur_phone_id = alignment[0]; int32 duration = 0; Vector<BaseFloat> phone_level_feat(1 + phone_num * 2); // [phone LPPs LPRs] SubVector<BaseFloat> lpp_part(phone_level_feat, 1, phone_num); @@ -242,9 +220,8 @@ int main(int argc, char *argv[]) { lpp_part.AddVec(1, frame_level_lpp); duration++; - int32 next_phone_id = (i < frame_num - 1) ? phoneme_alignment[i + 1]: -1; - int32 next_phone_pos = (i < frame_num - 1) ? phone_boundary[i + 1]: -1; - if (next_phone_pos != cur_phone_pos) { + int32 next_phone_id = (i < frame_num - 1) ? alignment[i + 1]: -1; + if (next_phone_id != cur_phone_id) { int32 phone_id = phone_map.empty() ?
cur_phone_id : phone_map[cur_phone_id]; // The current phone's features are now ready @@ -271,7 +248,6 @@ int main(int argc, char *argv[]) { duration = 0; } cur_phone_id = next_phone_id; - cur_phone_pos = next_phone_pos; } // Write GOPs and the GOP-based features diff --git a/src/bin/matrix-sum.cc b/src/bin/matrix-sum.cc index 6aee0c5ce78..3c93dfd0d39 100644 --- a/src/bin/matrix-sum.cc +++ b/src/bin/matrix-sum.cc @@ -49,7 +49,7 @@ int32 TypeOneUsage(const ParseOptions &po, } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0; + n_success = 0, n_missing = 0, n_other_errors = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -78,6 +78,7 @@ int32 TypeOneUsage(const ParseOptions &po, << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; + n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " @@ -123,7 +124,7 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { } int32 n_utts = 0, n_total_matrices = 0, - n_success = 0, n_missing = 0; + n_success = 0, n_missing = 0, n_other_errors = 0; for (; !matrix_reader1.Done(); matrix_reader1.Next()) { std::string key = matrix_reader1.Key(); @@ -150,6 +151,7 @@ int32 TypeOneUsageAverage(const ParseOptions &po) { << matrix_in_fns[i] << " vs " << matrix_out.NumRows() << " by " << matrix_out.NumCols() << " primary matrix, rspecifier:" << matrix_in_fn1; + n_other_errors++; } } else { KALDI_WARN << "No matrix found for utterance " << key << " for " diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 535c18365ed..22d4d92055d 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -172,8 +172,7 @@ int main(int argc, char *argv[]) { if (g_kaldi_verbose_level >= 2) { KALDI_LOG << "phn2word FST is below:"; fst::FstPrinter<fst::StdArc> fstprinter(phn2word, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cerr, fstprinter, "standard error"); - //fstprinter.Print(&std::cerr, "standard error"); + fstprinter.Print(std::cerr, "standard error"); KALDI_LOG << "phone sequence is: "; for (size_t i = 0; i < phones.size(); i++) std::cerr << phones[i] << ' '; diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index d03bf671245..3e622cafdc7 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -52,7 +52,7 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 n_utts = 0, n_total_vectors = 0, - n_success = 0, n_missing = 0; + n_success = 0, n_missing = 0, n_other_errors = 0; for (; !vector_reader1.Done(); vector_reader1.Next()) { std::string key = vector_reader1.Key(); @@ -75,6 +75,7 @@ int32 TypeOneUsage(const ParseOptions &po) { << "system " << (i + 2) << ", rspecifier: " << vector_in_fns[i] << " vs " << vector_out.Dim() << " primary vector, rspecifier:" << vector_in_fn1; + n_other_errors++; } } else { KALDI_WARN << "No vector found for utterance " << key << " for " diff --git a/src/chain/Makefile b/src/chain/Makefile index dbe6c38709f..fbad28f7de6 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -10,7 +10,7 @@ TESTFILES = chain-supervision-test language-model-test OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o \ chain-generic-numerator.o -ifeq ($(IS_GPU_BUILD), true) +ifeq ($(CUDA), true) OBJFILES += chain-kernels.o endif @@ -28,14 +28,7 @@ ifeq ($(CUDA), true) endif # Implicit rule for kernel compilation, -ifeq ($(CUDA), true) %.o :
%.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -endif -ifeq ($(ROCM), true) -%.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -endif - include ../makefiles/default_rules.mk diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index cbe15740872..ae350aefea7 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -312,7 +312,7 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, // previously an acceptor, so we project, i.e. copy the ilabels to the // olabels AddSubsequentialLoop(subsequential_symbol, &phone_lm); - fst::Project(&phone_lm, fst::PROJECT_INPUT); + fst::Project(&phone_lm, fst::ProjectType::INPUT); } std::vector disambig_syms; // empty list of disambiguation symbols. @@ -330,7 +330,7 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, // at this point, context_dep_lm will have indexes into 'ilabels' as its // input symbol (representing context-dependent phones), and phones on its // output. We don't need the phones, so we'll project. - fst::Project(&context_dep_lm, fst::PROJECT_INPUT); + fst::Project(&context_dep_lm, fst::ProjectType::INPUT); KALDI_LOG << "Number of states and arcs in context-dependent LM FST is " << context_dep_lm.NumStates() << " and " << NumArcs(context_dep_lm); @@ -365,7 +365,7 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, // context-dependent phones (indexes into IlabelInfo()) as its olabels. // Discard the context-dependent phones by projecting on the input, keeping // only the transition-ids. - fst::Project(&transition_id_fst, fst::PROJECT_INPUT); + fst::Project(&transition_id_fst, fst::ProjectType::INPUT); MapFstToPdfIdsPlusOne(trans_model, &transition_id_fst); KALDI_LOG << "Number of states and arcs in transition-id FST is " diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index 48c80cc8d92..f5814d7c11c 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -22,10 +22,6 @@ #define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ #include "chain/chain-datastruct.h" -#ifdef __IS_HIP_COMPILE__ -#include -#endif - #if HAVE_CUDA == 1 extern "C" { diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index ad6691fc895..a63944f0012 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -20,11 +20,6 @@ #include #include "chain/chain-kernels-ansi.h" -#ifdef __IS_HIP_COMPILE__ -#define __CUDA_ARCH__ 800 -#include -#endif - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 #error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ configure with --use-cuda=no (this will disable the use of GPU). diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index b29000a448c..9c009c6c0da 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -306,7 +306,7 @@ bool ProtoSupervisionToSupervision( // previously an acceptor, so we project, i.e. copy the ilabels to the // olabels AddSubsequentialLoop(subsequential_symbol, &phone_fst); - fst::Project(&phone_fst, fst::PROJECT_INPUT); + fst::Project(&phone_fst, fst::ProjectType::INPUT); } // inv_cfst will be expanded on the fly, as needed. @@ -325,7 +325,7 @@ bool ProtoSupervisionToSupervision( // 'inv_cfst.IlabelInfo()' as its input symbol (representing context-dependent // phones), and phones on its output. We don't need the phones, so we'll // project. 
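// (Aside, not part of the patch: this family of edits tracks an OpenFst API
// change -- OpenFst 1.8 replaced the plain enumerators fst::PROJECT_INPUT /
// fst::PROJECT_OUTPUT with the scoped enum fst::ProjectType::INPUT /
// fst::ProjectType::OUTPUT. A version-portable call, using the OPENFST_VER
// macro seen in the kaldi-types.h hunk above, might be sketched as:
//
//   #if OPENFST_VER >= 10800
//     fst::Project(&some_fst, fst::ProjectType::INPUT);
//   #else
//     fst::Project(&some_fst, fst::PROJECT_INPUT);
//   #endif
// )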
- fst::Project(&context_dep_fst, fst::PROJECT_INPUT); + fst::Project(&context_dep_fst, fst::ProjectType::INPUT); std::vector disambig_syms_h; // disambiguation symbols on input side of // H -- will be empty, as there were no @@ -364,7 +364,7 @@ bool ProtoSupervisionToSupervision( // context-dependent phones (indexes into ILabelInfo()) as its olabels. // Discard the context-dependent phones by projecting on the input, keeping // only the transition-ids. - fst::Project(&transition_id_fst, fst::PROJECT_INPUT); + fst::Project(&transition_id_fst, fst::ProjectType::INPUT); if (transition_id_fst.Properties(fst::kIEpsilons, true) != 0) { // remove epsilons, if there are any. fst::RmEpsilon(&transition_id_fst); @@ -385,7 +385,7 @@ bool ProtoSupervisionToSupervision( if (convert_to_pdfs) { // at this point supervision->fst will have pdf-ids plus one as the olabels, // but still transition-ids as the ilabels. Copy olabels to ilabels. - fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + fst::Project(&(supervision->fst), fst::ProjectType::OUTPUT); } KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 60a2645b31b..0117fe2200f 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -347,7 +347,7 @@ int main(int argc, char *argv[]) { // not configurable for now. exclude_names.push_back(std::string("ivector")); - int64 num_read = 0, num_written = 0; + int64 num_read = 0, num_written = 0, num_err = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { const std::string &key = example_reader.Key(); NnetChainExample &eg = example_reader.Value(); @@ -361,6 +361,7 @@ int main(int argc, char *argv[]) { BaseFloat weight = 1.0; if (!egs_weight_reader.HasKey(key)) { KALDI_WARN << "No weight for example key " << key; + num_err++; continue; } weight = egs_weight_reader.Value(key); @@ -370,6 +371,7 @@ int main(int argc, char *argv[]) { if (!eg_output_name_rspecifier.empty()) { if (!output_name_reader.HasKey(key)) { KALDI_WARN << "No new output-name for example key " << key; + num_err++; continue; } std::string new_output_name = output_name_reader.Value(key); diff --git a/src/configure b/src/configure index 1dc564e1030..fc3aee6808d 100755 --- a/src/configure +++ b/src/configure @@ -17,7 +17,7 @@ # ./configure --atlas-root=../tools/ATLAS/build # ./configure --use-cuda=no # disable CUDA detection (will build cpu-only # # version of kaldi even on CUDA-enabled machine. -# ./configure --use-cuda=yes --cudatk-dir=/usr/local/cuda/ --cuda-arch=-arch=sm_70 +# ./configure --use-cuda --cudatk-dir=/usr/local/cuda/ --cuda-arch=-arch=sm_70 # # Use cuda in /usr/local/cuda and set the arch to sm_70 # ./configure --static --fst-root=/opt/cross/armv8hf \ # --atlas-root=/opt/cross/armv8hf --host=armv8-rpi3-linux-gnueabihf @@ -39,7 +39,7 @@ # This should be incremented after any significant change to the configure # script, i.e. any change affecting kaldi.mk or the build system as a whole. -CONFIGURE_VERSION=15 +CONFIGURE_VERSION=14 # We support bash version 3.2 (Macs still ship with this version as of 2019) # and above. 
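Note on the fst::Project hunks above: they track OpenFst's move from the old unscoped PROJECT_INPUT/PROJECT_OUTPUT constants to the scoped fst::ProjectType enum introduced in the 1.8-era API. A minimal sketch of the new spelling, assuming OpenFst >= 1.8 headers are on the include path (the function name here is illustrative, not from the patch):

    // Standalone illustration: project an FST onto its input labels,
    // i.e. copy ilabels over olabels, using the scoped-enum API.
    #include <fst/fstlib.h>

    void MakeInputAcceptor(fst::StdVectorFst *f) {
      // Pre-1.8 equivalent: fst::Project(f, fst::PROJECT_INPUT);
      fst::Project(f, fst::ProjectType::INPUT);
    }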
@@ -69,14 +69,11 @@ Configuration options: --version Display the version of 'configure' and exit --static Build and link against static libraries [default=no] --shared Build and link against shared libraries [default=no] - --use-cuda Build with CUDA [default=no] + --use-cuda Build with CUDA [default=yes] --with-cudadecoder Build with CUDA decoder [default=yes] --cudatk-dir=DIR CUDA toolkit directory --cuda-arch=FLAGS Override the default CUDA_ARCH flags. See: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-examples. - --use-rocm Build with ROCm - --rocm-dir=DIR ROCM directory - --rocm-targets=TGTS Comma separated list of GPU targets to target through ROCm --debug-level=N Use assertion level 0 (disabled), 1, or 2 [default=1] --double-precision Build with BaseFloat set to double if yes [default=no], mostly useful for testing purposes. @@ -251,71 +248,6 @@ function check_for_slow_expf { fi } -# ROCM is used only in selected directories including src/cudamatrix, src/nnet* -# and src/chain*. It is used to accelerate the neural network training. -# The rest of Kaldi runs on CPUs. - -function configure_rocm { - # Check for ROCM in the system - if [ ! -d "$ROCMDIR" ]; then - for base in $ROCM_PATH /opt/rocm /usr/local/rocm /usr/; do - if [ -f $base/bin/hipcc ] && [ -f $base/bin/hipconfig ]; then - ROCMDIR=$base - break - fi - done - fi - - if [ -d "$ROCMDIR" ]; then - if [ ! -f $ROCMDIR/bin/hipcc ]; then - failure "Cannnot find hipcc and hipconfig in ROCm directory $ROCMDIR" - fi - fi - echo "Using ROCm $ROCMDIR (hipcc compiler and runtime libraries)" - echo >> kaldi.mk - echo "# ROCm configuration" >> kaldi.mk - echo >> kaldi.mk - echo IS_GPU_BUILD = true >> kaldi.mk - echo ROCM = true >> kaldi.mk - echo "ROCMDIR = $ROCMDIR" >> kaldi.mk - echo "HIPCC = $ROCMDIR/bin/hipcc" >> kaldi.mk - - echo "CUDA_ARCH = " >> kaldi.mk - echo "ROCM_ARCH_FLAGS = " >> kaldi.mk - for i in ${ROCM_TARGETS//,/ } ; do - echo "Targetting ROCm arch $i" - echo "ROCM_ARCH_FLAGS += --offload-arch=$i" >> kaldi.mk - done - - echo "HOST_ARCH = `uname -m`" >> kaldi.mk - echo >> kaldi.mk - - ROCM_MAJOR_VERSION=$(hipconfig -v | cut -d. -f1) - echo "ROCM_MAJOR_VERSION = $ROCM_MAJOR_VERSION" >> kaldi.mk - ROCM_MINOR_VERSION=$(hipconfig -v | cut -d. -f2) - echo "ROCM_MINOR_VERSION = $ROCM_MINOR_VERSION" >> kaldi.mk - - # Only ROCm 5.2+ is supported. - if [ $ROCM_MAJOR_VERSION -eq 5 ] && [ $ROCM_MINOR_VERSION -lt 2 ] || [ $ROCM_MAJOR_VERSION -lt 5 ] ; then - echo "\ -WARNING: ROCm $ROCM_MAJOR_VERSION.$ROCM_MINOR_VERSION found but ROCm 5.2 or above is required." - exit 1; - fi - - # 64bit/32bit? Not Linux? We do not support cross compilation with ROCm so, - # use direct calls to uname -m here - if [ "`uname -m`" == "x86_64" ] && [ "`uname`" == "Linux" ] ; then - cat makefiles/hip_64bit.mk >> kaldi.mk - else - echo "\ -WARNING: ROCm will not be used! - ROCm is only supported with 64-bit Linux builds." - exit 1; - fi -} - - - # CUDA is used only in selected directories including src/cudamatrix, src/nnet* # and src/chain*. It is used to accelerate the neural network training. # The rest of Kaldi runs on CPUs. @@ -351,7 +283,6 @@ Either your CUDA is too new or too old." GCC_VER=$($CXX -dumpversion) GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") case $CUDA_VERSION in - # Update this list by consulting https://gist.github.com/ax3l/9489132 # Disabling CUDA 7 and CUDA 8 because we now use C++14 to compile CUDA # code. 
It is still possible to use those cuda versions by switching # back to C++11 in src/makefiles/cuda_64bit.mk and use CUB <= 1.8.0. @@ -386,23 +317,20 @@ Either your CUDA is too new or too old." 11_*) MIN_UNSUPPORTED_GCC_VER="12.0" MIN_UNSUPPORTED_GCC_VER_NUM=120000 - CUSOLVER=true - ;; + ;; 12_*) - MIN_UNSUPPORTED_GCC_VER="12.3" - MIN_UNSUPPORTED_GCC_VER_NUM=123000 - CUSOLVER=true - ;; + MIN_UNSUPPORTED_GCC_VER="12.0" + MIN_UNSUPPORTED_GCC_VER_NUM=120000 + ;; *) failure "Unsupported CUDA version ${CUDA_VERSION}. Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ output of either 'nvcc -h' or 'ptxas -h'." ;; esac - if [ $GCC_VER_NUM -ge $MIN_UNSUPPORTED_GCC_VER_NUM ]; then + (( GCC_VER_NUM < MIN_UNSUPPORTED_GCC_VER_NUM )) || failure "CUDA $CUDA_VERSION does not support $CXX (g++-$GCC_VER).\ Only versions strictly older than $MIN_UNSUPPORTED_GCC_VER are supported." - fi case $CUDA_VERSION in [1-8]_* | 9_0) CUSOLVER=false ;; @@ -421,7 +349,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ 10_*) CUDA_ARCH="-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75" ;; 11_0) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80" ;; 11_*) CUDA_ARCH="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86" ;; - 12_*) CUDA_ARCH="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" ;; + 12_*) CUDA_ARCH="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90" ;; *) failure \ "Unsupported CUDA version ${CUDA_VERSION}. 
Please open an" \ "issue at https://github.com/kaldi-asr/kaldi/issues and" \ @@ -433,8 +361,7 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ #7_*) CUDA_ARCH="-gencode arch=compute_53,code=sm_53" ;; #8_*) CUDA_ARCH="-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62" ;; 9_*) CUDA_ARCH="-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62" ;; - 10_*|11_*) CUDA_ARCH="-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_72,code=sm_72" ;; - 12_*) CUDA_ARCH="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" ;; + 10_*|11_*|12_*) CUDA_ARCH="-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_72,code=sm_72" ;; *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; esac ;; @@ -449,7 +376,6 @@ Please open an issue at https://github.com/kaldi-asr/kaldi/issues and include\ echo "# CUDA configuration" >> kaldi.mk echo >> kaldi.mk - echo IS_GPU_BUILD = true >> kaldi.mk echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk @@ -681,8 +607,7 @@ ENV_LDLIBS=$LDLIBS debug_level=1 double_precision=false dynamic_kaldi=false -use_cuda=false -use_rocm=false +use_cuda=true with_cudadecoder=true static_fst=false static_math=false @@ -731,11 +656,8 @@ do --atlas-root=*) GetSwitchExistingPathOrDie ATLASROOT "$1" shift ;; - --use-rocm) - use_rocm=true; - shift ;; - --use-rocm=no) - use_rocm=false; + --use-cuda) + use_cuda=true; shift ;; --use-cuda=yes) use_cuda=true; @@ -812,13 +734,6 @@ do --mathlib=*) GetSwitchValueOrDie MATHLIB "$1" shift ;; - --rocm-dir=*) - # ROCM is used in src/cudamatrix and src/nnet{,bin} only. - GetSwitchExistingPathOrDie ROCMDIR "$1" - shift ;; - --rocm-targets=*) - GetSwitchValueOrDie ROCM_TARGETS "$1" - shift ;; --cudatk-dir=*) # CUDA is used in src/cudamatrix and src/nnet{,bin} only. GetSwitchExistingPathOrDie CUDATKDIR "$1" @@ -894,12 +809,6 @@ if is_set $HOST; then fi else TARGET_ARCH="$HOST" - if ! $static_fst || ! $static_math || $dynamic_kaldi; then - echo "WARNING: Dynamic libraries are not currently supported when compiling to WASM. Overriding --static, --static-math, and --static-fst." - fi - dynamic_kaldi=false - static_math=true - static_fst=true fi HOST_CXX="$HOST-c++" @@ -930,7 +839,7 @@ auto_lib= # Deduced lib name, used when $MATHLIB is not set. # Validate the (optionally) provided MATHLIB value. case $MATHLIB in - ''|ATLAS|CLAPACK|MKL|OPENBLAS) : ;; + ''|ATLAS|CLAPACK|MKL|OPENBLAS|OPENBLAS_CLAPACK) : ;; *) failure "Unknown --mathlib='${MATHLIB}'. Supported libs: ATLAS CLAPACK MKL OPENBLAS" ;; esac @@ -1031,14 +940,6 @@ OPENFST_VER_NUM=$(echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d" if [ $OPENFST_VER_NUM -lt 10600 ]; then failure "OpenFst-$OPENFST_VER is not supported. 
You need OpenFst >= 1.6.0.)" fi - -if [ $OPENFST_VER_NUM -lt 10800 ]; then - echo "CXXLANGVERSION = c++14" -else - echo "CXXLANGVERSION = c++17" -fi >> kaldi.mk - -echo "OPENFSTVER = $OPENFST_VER_NUM" >> kaldi.mk echo "OPENFSTINC = $FSTROOT/include" >> kaldi.mk if $static_fst ; then OPENFSTLIBS="$FSTROOT/lib/libfst.a" @@ -1072,11 +973,7 @@ if $use_cuda; then fi echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk else - if $use_rocm; then - echo "WITH_CUDADECODER = $with_cudadecoder" >> kaldi.mk - else - echo "WITH_CUDADECODER = false" >> kaldi.mk - fi + echo "WITH_CUDADECODER = false" >> kaldi.mk fi echo >> kaldi.mk @@ -1165,8 +1062,6 @@ elif [ "`uname`" == "Darwin" ]; then cat makefiles/darwin_clapack.mk >> kaldi.mk echo "Warning (CLAPACK): this part of the configure process is not properly tested and may not work." echo "Successfully configured for Darwin with CLAPACK libs from $CLAPACKROOT" - elif [ "$(uname -m)" == "arm64" ]; then - cat makefiles/darwin_arm64.mk >> kaldi.mk else cat makefiles/darwin.mk >> kaldi.mk fi @@ -1366,6 +1261,14 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)." ** You can also use other matrix algebra libraries. For information, see: ** http://kaldi-asr.org/doc/matrixwrap.html" fi + if [ -f $OPENBLASROOT/lib/libopenblas.so ]; then + OPENBLASLIBDIR=$OPENBLASROOT/lib + elif [ -f $OPENBLASROOT/lib64/libopenblas.so ]; then + # in REDHAT/CentOS package installs, the library is located here + OPENBLASLIBDIR=$OPENBLASROOT/lib64 + else + failure "Expected to find the file $OPENBLASROOT/lib/libopenblas.so" + fi if [ -f $OPENBLASROOT/include/cblas.h ] ; then OPENBLASINCDIR=$OPENBLASROOT/include elif [ -f $OPENBLASROOT/include/openblas/cblas.h ] ; then @@ -1379,35 +1282,75 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)." echo "** if it is a package-based install)." OPENBLASINCDIR="/usr/include" fi + echo "Your math library seems to be OpenBLAS from $OPENBLASROOT. Configuring appropriately." + # TODO(kkm): Probably, OpenBLAS required libgfortran.so.3 at some point, but + # no longer does. *My* linker does not complain about a missing library, but + # is it safe to keep the reference if no longer required? Try to figure out + # how long ago the dependency was dropped. 
if $static_math; then - if [ -f $OPENBLASROOT/lib/libopenblas.a ]; then - OPENBLASLIBDIR=$OPENBLASROOT/lib - else - failure "Expected to find the file $OPENBLASROOT/lib/libopenblas.a" - fi echo "Configuring static OpenBlas since --static-math=yes" - OPENBLASLIBS="-L$OPENBLASLIBDIR -l:libopenblas.a" - # No Fortran for OpenBLAS - if [[ "$HOST" != WASM ]]; then - OPENBLASLIBS+="-lgfortran" - fi + OPENBLASLIBS="-L$OPENBLASLIBDIR -l:libopenblas.a -lgfortran" else - if [ -f $OPENBLASROOT/lib/libopenblas.so ]; then - OPENBLASLIBDIR=$OPENBLASROOT/lib - elif [ -f $OPENBLASROOT/lib64/libopenblas.so ]; then - # in REDHAT/CentOS package installs, the library is located here - OPENBLASLIBDIR=$OPENBLASROOT/lib64 - else - failure "Expected to find the file $OPENBLASROOT/lib/libopenblas.so" - fi echo "Configuring dynamically loaded OpenBlas since --static-math=no (the default)" OPENBLASLIBS="-L$OPENBLASLIBDIR -lopenblas -lgfortran -Wl,-rpath=$OPENBLASLIBDIR" fi + echo "OPENBLASINC = $OPENBLASINCDIR" >> kaldi.mk + echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk + echo >> kaldi.mk + case $TARGET_ARCH in + aarch64*) cat makefiles/linux_openblas_aarch64.mk ;; + arm*) cat makefiles/linux_openblas_arm.mk ;; + ppc64le) cat makefiles/linux_openblas_ppc64le.mk ;; + riscv64) cat makefiles/linux_openblas_riscv64.mk ;; + *) cat makefiles/linux_openblas.mk ;; + esac >> kaldi.mk + + echo "Successfully configured for Linux with OpenBLAS from $OPENBLASROOT" + elif [ "$MATHLIB" == "OPENBLAS_CLAPACK" ]; then + if [[ ! $OPENBLASROOT ]]; then + # Either the user specified --mathlib=OPENBLAS or we've autodetected the + # system where OpenBLAS is the preferred option (the parser for + # --openblas-root fails fatally if the path does not exist, so we trust + # that if set, the variable contains the existing path, converted to + # absolute form). + OPENBLASROOT="$(rel2abs ../tools/OpenBLAS/install)" || + Die "OpenBLAS not found in '../tools/OpenBLAS/install'. +** This is the only place we look for it. The best option is to build OpenBLAS +** tuned for your system and CPU. To do that, run the following commands: +** +** cd ../tools; extras/install_openblas.sh +** +** Another option is to specify the location of existing OpenBLAS directory +** with the switch '--openblas-root='. However, even if a package is provided +** for your system, the packaged version is almost always significantly slower +** and often older than the above commands can fetch and build. +** +** You can also use other matrix algebra libraries. For information, see: +** http://kaldi-asr.org/doc/matrixwrap.html" + fi + if [ -f $OPENBLASROOT/lib/libopenblas.so ]; then + OPENBLASLIBDIR=$OPENBLASROOT/lib + elif [ -f $OPENBLASROOT/lib64/libopenblas.so ]; then + # in REDHAT/CentOS package installs, the library is located here + OPENBLASLIBDIR=$OPENBLASROOT/lib64 + else + failure "Expected to find the file $OPENBLASROOT/lib/libopenblas.so" + fi + if [ -f $OPENBLASROOT/include/cblas.h ] ; then + OPENBLASINCDIR=$OPENBLASROOT/include + elif [ -f $OPENBLASROOT/include/openblas/cblas.h ] ; then + # in REDHAT/CentOS/Ubuntu package installs, the includes are located here + OPENBLASINCDIR=$OPENBLASROOT/include/openblas + else + echo "$0: ***** Using OpenBLAS from $OPENBLASROOT but cblas.h is not found. " + echo "** Assuming openblas is already in a default include path, but" + echo "** if you get compilation messages about not finding files like cblas.h," + echo "** you should look into this (e.g. 
make sure to install the 'openblas-dev' package," + echo "** if it is a package-based install)." + OPENBLASINCDIR="/usr/include" + fi echo "Your math library seems to be OpenBLAS from $OPENBLASROOT. Configuring appropriately." - # TODO(kkm): Probably, OpenBLAS required libgfortran.so.3 at some point, but - # no longer does. *My* linker does not complain about a missing library, but - # is it safe to keep the reference if no longer required? Try to figure out - # how long ago the dependency was dropped. + OPENBLASLIBS="-L$OPENBLASLIBDIR -l:libopenblas.a -l:libblas.a -l:liblapack.a -l:libf2c.a" echo "OPENBLASINC = $OPENBLASINCDIR" >> kaldi.mk echo "OPENBLASLIBS = $OPENBLASLIBS" >> kaldi.mk echo >> kaldi.mk @@ -1415,15 +1358,18 @@ or try another math library, e.g. --mathlib=OPENBLAS (Kaldi may be slower)." aarch64*) cat makefiles/linux_openblas_aarch64.mk ;; arm*) cat makefiles/linux_openblas_arm.mk ;; ppc64le) cat makefiles/linux_openblas_ppc64le.mk ;; + riscv64) cat makefiles/linux_openblas_riscv64.mk ;; *) cat makefiles/linux_openblas.mk ;; esac >> kaldi.mk + echo >> kaldi.mk + echo "CXXFLAGS += -DUSE_KALDI_SVD" >> kaldi.mk + echo "Successfully configured for Linux with OpenBLAS from $OPENBLASROOT" else failure "Unsupported linear algebra library '$MATHLIB'" fi $use_cuda && configure_cuda - $use_rocm && configure_rocm linux_configure_speex else failure "Could not detect the platform or we have not yet worked out the diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile index a7972f1831d..e2569e89ab7 100644 --- a/src/cudadecoder/Makefile +++ b/src/cudadecoder/Makefile @@ -3,15 +3,13 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(IS_GPU_BUILD), true) +ifeq ($(CUDA), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif -endif TESTFILES = @@ -36,14 +34,8 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation -ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) -endif -ifeq ($(ROCM), true) -%.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -endif else all: diff --git a/src/cudadecoder/batched-static-nnet3-kernels.cu b/src/cudadecoder/batched-static-nnet3-kernels.cu index 429d9f72326..f02a78ed1af 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.cu +++ b/src/cudadecoder/batched-static-nnet3-kernels.cu @@ -17,11 +17,6 @@ #include "cudadecoder/batched-static-nnet3-kernels.h" -#ifdef __IS_HIP_COMPILE__ -#include "hip/hip_runtime.h" -#include "hipify.h" -#endif - #include namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h index fec2470a9db..45064e15071 100644 --- a/src/cudadecoder/batched-static-nnet3-kernels.h +++ b/src/cudadecoder/batched-static-nnet3-kernels.h @@ -17,13 +17,7 @@ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else #include -#endif #include "base/kaldi-types.h" #ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc index bec20cb9e07..0b75e85870e 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc +++ 
b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc @@ -21,13 +21,7 @@ #include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h" -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else -#include -#endif +#include #include #include @@ -115,7 +109,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::AllocateAndInitializeData( // Feature extraction if (config_.use_gpu_feature_extraction) { gpu_feature_pipeline_.reset(new OnlineBatchedFeaturePipelineCuda( - config_.feature_opts, samples_per_chunk_, config_.max_batch_size, + feature_info_, samples_per_chunk_, config_.max_batch_size, num_channels_)); } else { feature_pipelines_.resize(num_channels_); @@ -130,7 +124,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::AllocateAndInitializeData( thread_pool_.get(), config_.num_decoder_copy_threads); } - decoder_frame_shift_seconds_ = feature_info_->FrameShiftInSeconds() * + decoder_frame_shift_seconds_ = feature_info_.FrameShiftInSeconds() * config_.compute_opts.frame_subsampling_factor; cuda_decoder_->SetOutputFrameShiftInSeconds(decoder_frame_shift_seconds_); @@ -236,7 +230,7 @@ bool BatchedThreadedNnet3CudaOnlinePipeline::TryInitCorrID( if (!config_.use_gpu_feature_extraction) { KALDI_ASSERT(!feature_pipelines_[ichannel]); feature_pipelines_[ichannel].reset( - new OnlineNnet2FeaturePipeline(*feature_info_)); + new OnlineNnet2FeaturePipeline(feature_info_)); } channels_info_[ichannel].Reset(); @@ -699,16 +693,12 @@ void BatchedThreadedNnet3CudaOnlinePipeline::RunDecoder( } void BatchedThreadedNnet3CudaOnlinePipeline::ReadParametersFromModel() { - feature_info_.reset(new OnlineNnet2FeaturePipelineInfo(config_.feature_opts)); - feature_info_->ivector_extractor_info.use_most_recent_ivector = true; - feature_info_->ivector_extractor_info.greedy_ivector_extractor = true; - - OnlineNnet2FeaturePipeline feature(*feature_info_); + OnlineNnet2FeaturePipeline feature(feature_info_); use_ivectors_ = (feature.IvectorFeature() != NULL); input_dim_ = feature.InputFeature()->Dim(); if (use_ivectors_) ivector_dim_ = feature.IvectorFeature()->Dim(); - model_frequency_ = feature_info_->GetSamplingFrequency(); - BaseFloat frame_shift_seconds = feature_info_->FrameShiftInSeconds(); + model_frequency_ = feature_info_.GetSamplingFrequency(); + BaseFloat frame_shift_seconds = feature_info_.FrameShiftInSeconds(); input_frames_per_chunk_ = config_.compute_opts.frames_per_chunk; seconds_per_chunk_ = input_frames_per_chunk_ * frame_shift_seconds; int32 samp_per_frame = diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h index 6608aa79dd8..fb89a5f6087 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h @@ -89,7 +89,6 @@ struct BatchedThreadedNnet3CudaOnlinePipelineConfig { "reset-on-endpoint", &reset_on_endpoint, "Reset a decoder channel when endpoint detected. 
Do not close stream"); - feature_opts.Register(po); decoder_opts.Register(po); det_opts.Register(po); compute_opts.Register(po); @@ -102,7 +101,6 @@ struct BatchedThreadedNnet3CudaOnlinePipelineConfig { bool use_gpu_feature_extraction; bool reset_on_endpoint; - OnlineNnet2FeaturePipelineConfig feature_opts; CudaDecoderConfig decoder_opts; fst::DeterminizeLatticePhonePrunedOptions det_opts; nnet3::NnetSimpleComputationOptions compute_opts; @@ -132,12 +130,14 @@ class BatchedThreadedNnet3CudaOnlinePipeline { BatchedThreadedNnet3CudaOnlinePipeline( const BatchedThreadedNnet3CudaOnlinePipelineConfig &config, + OnlineNnet2FeaturePipelineInfo &feature_info, const fst::Fst &decode_fst, const nnet3::AmNnetSimple &am_nnet, const TransitionModel &trans_model) : config_(config), max_batch_size_(config.max_batch_size), num_channels_(std::max(max_batch_size_ * KALDI_CUDA_DECODER_MIN_NCHANNELS_FACTOR, config_.num_channels)), channels_info_(num_channels_), + feature_info_(feature_info), trans_model_(&trans_model), am_nnet_(&am_nnet), available_channels_(num_channels_), @@ -388,10 +388,12 @@ class BatchedThreadedNnet3CudaOnlinePipeline { int32 num_channels_; std::vector channels_info_; + + // Features + OnlineNnet2FeaturePipelineInfo &feature_info_; // Models const TransitionModel *trans_model_; const nnet3::AmNnetSimple *am_nnet_; - std::unique_ptr feature_info_; // Decoder channels currently available, w/ mutex std::vector available_channels_; diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc index 32d7ac40e12..89e93e5d98c 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc @@ -26,13 +26,7 @@ #include -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else -#include -#endif +#include #include "base/kaldi-utils.h" #include "cudadecoder/cuda-fst.h" diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc index 4b30c568e73..78966e181e9 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc @@ -23,13 +23,7 @@ #include -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else -#include -#endif +#include namespace kaldi { namespace cuda_decoder { @@ -39,10 +33,11 @@ const float kSleepForNewTask = 100e-6; BatchedThreadedNnet3CudaPipeline2::BatchedThreadedNnet3CudaPipeline2( const BatchedThreadedNnet3CudaPipeline2Config &config, + OnlineNnet2FeaturePipelineInfo &feature_info, const fst::Fst &decode_fst, const nnet3::AmNnetSimple &am_nnet, const TransitionModel &trans_model) : config_(config), - cuda_online_pipeline_(config.cuda_online_pipeline_opts, decode_fst, + cuda_online_pipeline_(config.cuda_online_pipeline_opts, feature_info, decode_fst, am_nnet, trans_model), use_online_features_(config_.use_online_features), corr_id_cnt_(0), @@ -67,8 +62,7 @@ BatchedThreadedNnet3CudaPipeline2::BatchedThreadedNnet3CudaPipeline2( n_input_per_chunk_ = cuda_online_pipeline_.GetNSampsPerChunk(); } else { n_input_per_chunk_ = cuda_online_pipeline_.GetNInputFramesPerChunk(); - cuda_features_.reset(new OnlineCudaFeaturePipeline( - config_.cuda_online_pipeline_opts.feature_opts)); + cuda_features_.reset(new OnlineCudaFeaturePipeline(feature_info)); wave_buffer_.reset(new HostDeviceVector()); next_wave_buffer_.reset(new HostDeviceVector()); } diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h 
b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h index d08c5782cee..c4548849761 100644 --- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h +++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h @@ -152,6 +152,7 @@ class BatchedThreadedNnet3CudaPipeline2 { public: BatchedThreadedNnet3CudaPipeline2( const BatchedThreadedNnet3CudaPipeline2Config &config, + OnlineNnet2FeaturePipelineInfo &info, const fst::Fst &decode_fst, const nnet3::AmNnetSimple &am_nnet, const TransitionModel &trans_model); diff --git a/src/cudadecoder/cuda-decoder-kernels-utils.h b/src/cudadecoder/cuda-decoder-kernels-utils.h index add66312817..fc0d2cddd2c 100644 --- a/src/cudadecoder/cuda-decoder-kernels-utils.h +++ b/src/cudadecoder/cuda-decoder-kernels-utils.h @@ -137,7 +137,7 @@ __device__ __inline__ void atomicMinI2(int2 *ptr, int2 val) { value.i2 = val; if (old.i2.x <= val.x) return; do { - assumed.ull = old.ull; + assumed = old; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); } while (old.ull != assumed.ull && old.i2.x > value.i2.x); } @@ -148,7 +148,7 @@ __device__ void atomicSubI2(int2 *ptr, int2 sub) { UInt64UnionInt2 old, assumed, value; old.ull = *ptr64; do { - assumed.ull = old.ull; + assumed = old; value.i2.x = assumed.i2.x - sub.x; value.i2.y = assumed.i2.y - sub.y; old.ull = atomicCAS(ptr64, assumed.ull, value.ull); diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu index e20a7dea15c..3a835d02b76 100644 --- a/src/cudadecoder/cuda-decoder-kernels.cu +++ b/src/cudadecoder/cuda-decoder-kernels.cu @@ -15,21 +15,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef __IS_HIP_COMPILE__ -#include - -#include "float.h" -#include "hipify.h" -#else #include -#endif #include "cuda-decoder-kernels.h" #include "cuda-decoder-kernels-utils.h" -#ifndef FLT_MAX -#define FLT_MAX 340282346638528859811704183484516925440.0f -#endif - namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc index 15f29d27122..1ec456ac32c 100644 --- a/src/cudadecoder/cuda-decoder.cc +++ b/src/cudadecoder/cuda-decoder.cc @@ -37,15 +37,8 @@ #include #include -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include -#include -#endif +#include #include "base/kaldi-utils.h" #include "cudadecoder/cuda-decoder-kernels.h" @@ -191,36 +184,35 @@ void CudaDecoder::AllocateDeviceData() { void CudaDecoder::AllocateHostData() { channel_to_compute_.resize(nlanes_); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void **)&h_extra_and_acoustic_cost_concat_, + &h_extra_and_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_and_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void **)&h_acoustic_cost_concat_, + &h_acoustic_cost_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void **)&h_extra_prev_tokens_concat_, + &h_extra_prev_tokens_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void **)&h_infotoken_concat_, + &h_infotoken_concat_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_))); KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost((void **)&h_extra_and_acoustic_cost_concat_tmp_, + cudaMallocHost(&h_extra_and_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * 
sizeof(*h_extra_and_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void **)&h_acoustic_cost_concat_tmp_, + &h_acoustic_cost_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_acoustic_cost_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void **)&h_extra_prev_tokens_concat_tmp_, + &h_extra_prev_tokens_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_extra_prev_tokens_concat_tmp_))); KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( - (void **)&h_infotoken_concat_tmp_, + &h_infotoken_concat_tmp_, nlanes_ * main_q_capacity_ * sizeof(*h_infotoken_concat_tmp_))); h_lanes_counters_.Resize( nlanes_ + 1, 1); // +1 because we sometimes need last+1 value (for offsets) - KALDI_DECODER_CUDA_API_CHECK_ERROR( - cudaMallocHost((void **)&h_channels_counters_, - nchannels_ * sizeof(*h_channels_counters_))); + KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaMallocHost( + &h_channels_counters_, nchannels_ * sizeof(*h_channels_counters_))); h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.resize(nchannels_); h_all_tokens_acoustic_cost_.resize(nchannels_); diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h index f6ee37512e2..de2bd09f47c 100644 --- a/src/cudadecoder/cuda-decoder.h +++ b/src/cudadecoder/cuda-decoder.h @@ -20,13 +20,7 @@ #if HAVE_CUDA -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else #include -#endif #include #include diff --git a/src/cudadecoder/cuda-fst.cc b/src/cudadecoder/cuda-fst.cc index 6b0d34f81b7..56066ee069d 100644 --- a/src/cudadecoder/cuda-fst.cc +++ b/src/cudadecoder/cuda-fst.cc @@ -22,15 +22,8 @@ #include "cudadecoder/cuda-fst.h" #include "cudamatrix/cu-common.h" -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include -#include -#endif +#include namespace kaldi { namespace cuda_decoder { diff --git a/src/cudadecoder/lattice-postprocessor.cc b/src/cudadecoder/lattice-postprocessor.cc index 49f96191787..46d44216890 100644 --- a/src/cudadecoder/lattice-postprocessor.cc +++ b/src/cudadecoder/lattice-postprocessor.cc @@ -78,14 +78,13 @@ bool LatticePostprocessor::GetPostprocessedLattice( KALDI_ASSERT(decoder_frame_shift_ != 0.0 && "SetDecoderFrameShift() must be called (typically by pipeline)"); - if (word_info_) { - // ok &= - // Ignoring the return false for now (but will print a warning), - // because the doc says we can, and it can happen when using endpointing - WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); - } else { - *out_clat = clat; - } + if (!word_info_) + KALDI_ERR << "You must set --word-boundary-rxfilename in the lattice " + "postprocessor config"; + // ok &= + // Ignoring the return false for now (but will print a warning), + // because the doc says we can, and it can happen when using endpointing + WordAlignLattice(clat, *tmodel_, *word_info_, max_states, out_clat); return ok; } diff --git a/src/cudadecoderbin/Makefile b/src/cudadecoderbin/Makefile index 96b00c06101..1f093299eb4 100644 --- a/src/cudadecoderbin/Makefile +++ b/src/cudadecoderbin/Makefile @@ -2,15 +2,13 @@ all: ; include ../kaldi.mk -ifeq ($(IS_GPU_BUILD), true) +ifeq ($(CUDA), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif -endif LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc index 
a47ea2e2300..70908cbea0c 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc @@ -23,15 +23,9 @@ #error CUDA support must be configured to compile this binary. #endif -#ifdef __IS_HIP_COMPILE__ -#include "hip/hip_runtime.h" -#include "hipify.h" -#include "roctracer/roctx.h" -#else #include #include -#include -#endif +#include #include #include @@ -85,8 +79,9 @@ int main(int argc, char *argv[]) { fst::Fst *decode_fst; fst::SymbolTable *word_syms; ReadModels(opts, &trans_model, &am_nnet, &decode_fst, &word_syms); + OnlineNnet2FeaturePipelineInfo feature_info(opts.feature_config); BatchedThreadedNnet3CudaOnlinePipeline cuda_pipeline( - opts.batched_decoder_config, *decode_fst, am_nnet, trans_model); + opts.batched_decoder_config, feature_info, *decode_fst, am_nnet, trans_model); delete decode_fst; if (word_syms) cuda_pipeline.SetSymbolTable(*word_syms); diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc index 06aac47b5e0..46138116bd8 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc @@ -17,15 +17,9 @@ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include "hip/hip_runtime.h" -#include "hipify.h" -#include "roctracer/roctx.h" -#else #include #include -#include -#endif +#include #include #include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h" #include "cudamatrix/cu-allocator.h" diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc index b7a9d463214..e6513f9fc7f 100644 --- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc +++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc @@ -18,17 +18,9 @@ #include #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include -#include - -#include "hipify.h" -#else #include #include -#include -#endif +#include #include @@ -101,9 +93,11 @@ int main(int argc, char *argv[]) { // Multi-threaded CPU and batched GPU decoder BatchedThreadedNnet3CudaPipeline2Config batched_decoder_config; + OnlineNnet2FeaturePipelineConfig feature_config; CuDevice::RegisterDeviceOptions(&po); RegisterCuAllocatorOptions(&po); batched_decoder_config.Register(&po); + feature_config.Register(&po); po.Read(argc, argv); @@ -121,6 +115,8 @@ int main(int argc, char *argv[]) { std::shared_ptr trans_model(new TransitionModel()); nnet3::AmNnetSimple am_nnet; + // Read feature info + OnlineNnet2FeaturePipelineInfo feature_info(feature_config); // read transition model and nnet bool binary; @@ -145,7 +141,7 @@ int main(int argc, char *argv[]) { KALDI_CUDA_DECODER_BIN_MAX_SEGMENT_LENGTH_S; } BatchedThreadedNnet3CudaPipeline2 cuda_pipeline( - batched_decoder_config, *decode_fst, am_nnet, *trans_model); + batched_decoder_config, feature_info, *decode_fst, am_nnet, *trans_model); delete decode_fst; diff --git a/src/cudadecoderbin/cuda-bin-tools.h b/src/cudadecoderbin/cuda-bin-tools.h index 0cf21a9f5f4..31fd3716f3e 100644 --- a/src/cudadecoderbin/cuda-bin-tools.h +++ b/src/cudadecoderbin/cuda-bin-tools.h @@ -67,6 +67,7 @@ struct CudaOnlineBinaryOptions { wav_rspecifier, clat_wspecifier; std::string lattice_postprocessor_config_rxfilename; BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config; + OnlineNnet2FeaturePipelineConfig feature_config; }; inline int SetUpAndReadCmdLineOptions(int argc, char *argv[], @@ -107,6 +108,7 @@ inline int SetUpAndReadCmdLineOptions(int argc, char *argv[], CuDevice::RegisterDeviceOptions(&po); 
RegisterCuAllocatorOptions(&po); opts.batched_decoder_config.Register(&po); + opts.feature_config.Register(&po); po.Read(argc, argv); diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile index d7739dae623..54bcc53af1e 100644 --- a/src/cudafeat/Makefile +++ b/src/cudafeat/Makefile @@ -2,15 +2,13 @@ all: ; include ../kaldi.mk -ifeq ($(IS_GPU_BUILD), true) +ifeq ($(CUDA), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifeq ($(CUDA), true) ifndef CUDA_ARCH $(error CUDA_ARCH is undefined, run 'src/configure') endif -endif TESTFILES = @@ -39,14 +37,9 @@ LDLIBS += $(CUDA_LDLIBS) # Implicit rule for kernel compilation -ifeq ($(CUDA), true) %.o : %.cu $(CUDATKDIR)/bin/nvcc -c -g $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -I$(OPENFSTINC) -endif -ifeq ($(ROCM), true) -%.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -I$(OPENFSTINC) -endif + else all: $(warning "Not building cudadecoder extension -- to build with it, configure with --with-cudadecoder[=true]") diff --git a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu index 1df9c6a7a43..d803a915ea0 100644 --- a/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-cmvn-cuda-kernels.cu @@ -15,13 +15,7 @@ // See the License for the specific language governing permissions and // limitations under the License. // -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else #include -#endif #include "cudafeat/feature-online-batched-cmvn-cuda-kernels.h" __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { diff --git a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu index 5b94c34e829..0b57d6a32ea 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-ivector-cuda-kernels.cu @@ -16,13 +16,7 @@ // limitations under the License. 
#if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else #include -#endif #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { @@ -51,7 +45,7 @@ void square_batched_matrix(int32_t chunk_frames, int32_t num_cols, const float *feats, int32_t ldf, int32_t stridef, float *feats_sq, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(32, 32); dim3 blocks((num_cols + threads.x - 1) / threads.x, (chunk_frames + threads.y - 1) / threads.y, num_lanes); @@ -102,11 +96,8 @@ void zero_invalid_posteriors(int32_t num_chunk_frames, int32_t num_gauss, float *posteriors, int32_t ldp, int32_t stridep, int32_t right, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks((num_gauss + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (num_chunk_frames + GPU_MAX_WARPS_PER_BLOCK - 1) / - GPU_MAX_WARPS_PER_BLOCK, - num_lanes); + dim3 threads(32, 32); + dim3 blocks((num_gauss + 31) / 32, (num_chunk_frames + 31) / 32, num_lanes); zero_invalid_posteriors_kernel<<>>( num_chunk_frames, num_gauss, posteriors, ldp, stridep, right, lanes, @@ -219,11 +210,8 @@ void splice_features_batched(int32_t num_chunk_frames, int32_t feat_dim, int32_t stridest, float *spliced_feats, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * - GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) - threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is - // GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size + if (threads > 1024) threads = 1024; // Max block size is 1024 threads dim3 blocks(num_chunk_frames, num_lanes); @@ -318,10 +306,10 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, // First we need to shift feats to handle the case where num_chunk_frames // is less than stash size - KALDI_ASSERT(stash_size <= GPU_WARP_SIZE); - // This only works if stash size is <= GPU_WARP_SIZE as we rely on - // __syncthreads() to avoid read/write hazards when reading/writing in-place - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + KALDI_ASSERT(stash_size <= 32); + // This only works if stash size is <= 32 as we rely on __syncthreads() + // to avoid read/write hazards when reading/writing in-place + dim3 threads(32, 32); dim3 blocks(num_lanes); shift_feats_kernel<<>>(chunk_size, feats, feat_dim, ldf, @@ -330,11 +318,9 @@ void stash_feats(int32_t chunk_size, const float *feats, int32_t feat_dim, } { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * - GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) - threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is - // GPU_MAX_THREADS_PER_BLOCK threads + int threads = + (feat_dim + 31) / 32 * 32; // round up to the nearest warp size + if (threads > 1024) threads = 1024; // Max block size is 1024 threads dim3 blocks(stash_size, num_lanes); // Then we need to copy feats from source into stash @@ -516,9 +502,8 @@ __global__ void batched_convert_sp_to_dense_kernel(int32_t n, float *A_sp, void batched_convert_sp_to_dense(int n, float *A_sp, int32_t strides, float *A, int32_t lda, int32_t stridea, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(GPU_WARP_SIZE, 
GPU_MAX_WARPS_PER_BLOCK); - int block = - (n + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE; // blocks in x and y dimensions + dim3 threads(32, 32); + int block = (n + 31) / 32; // blocks in x and y dimensions dim3 blocks(block, block, num_lanes); batched_convert_sp_to_dense_kernel<<>>( @@ -594,7 +579,7 @@ void initialize_channels(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t strideg, float *X, int32_t ldx, int32_t stridex, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(32, 32); int32_t blocks = num_lanes; initialize_channels_kernel<<>>( @@ -639,7 +624,7 @@ void apply_and_update_stash(int32_t num_gauss, int32_t feat_dim, float *gamma, int32_t ldx, int32_t stridex, float *X_stash, int32_t lds, int32_t strides, const LaneDesc *lanes, int32_t num_lanes) { - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(32, 32); int32_t blocks = num_lanes; apply_and_update_stash_kernel<<>>( diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.cc b/src/cudafeat/feature-online-batched-ivector-cuda.cc index 1699f8c1e77..c80f43b3563 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.cc +++ b/src/cudafeat/feature-online-batched-ivector-cuda.cc @@ -15,28 +15,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef __IS_HIP_COMPILE__ -#include "hipify.h" -// The BLAS enumerators are used instead of the SOLVER ones. -#ifdef CUBLAS_FILL_MODE_LOWER -#undef CUBLAS_FILL_MODE_LOWER -#endif -#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER -#ifdef CUDA_R_32F -#undef CUDA_R_32F -#endif -#define CUDA_R_32F HIPBLAS_R_32F -#endif - #include "cudafeat/feature-online-batched-ivector-cuda.h" #include "cudafeat/feature-online-batched-ivector-cuda-kernels.h" namespace kaldi { BatchedIvectorExtractorCuda::BatchedIvectorExtractorCuda( - const OnlineIvectorExtractionConfig &config, + const OnlineIvectorExtractionInfo &info, int32_t feat_dim, int32_t chunk_size, int32_t num_lanes, int32_t num_channels) - : cmvn_(NULL), + : info_(info), + cmvn_(NULL), feat_dim_(feat_dim), chunk_size_(chunk_size), max_lanes_(num_lanes), @@ -46,8 +34,7 @@ BatchedIvectorExtractorCuda::BatchedIvectorExtractorCuda( // upgrade to a more recent CUDA version. KALDI_ERR << "BatchedIvectorExtractorCuda requires CUDA 9.1 or newer."; #endif - info_.Init(config); - Read(config); + Read(); naive_cmvn_state_ = OnlineCmvnState(info_.global_cmvn_stats); // TODO parameterize coarsening factor? 
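The kernel-launch hunks above replace the HIP-era GPU_WARP_SIZE/GPU_MAX_WARPS_PER_BLOCK macros with the plain CUDA constants they stood for: 32-thread warps and a 1024-thread block limit. The round-up arithmetic they restore is easy to misread, so here is a small self-contained sketch (the helper name is illustrative, not from the patch):

    // Round a dimension up to a whole number of 32-thread warps, capped at
    // the 1024-thread CUDA block limit, as in the restored launch configs.
    #include <algorithm>

    inline int ThreadsForDim(int dim) {
      int threads = (dim + 31) / 32 * 32;  // e.g. dim = 40 -> 64 threads
      return std::min(threads, 1024);      // e.g. dim = 1500 -> 1024 threads
    }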
@@ -113,63 +100,35 @@ BatchedIvectorExtractorCuda::~BatchedIvectorExtractorCuda() { CuDevice::Instantiate().Free(ivec_array_); } -void BatchedIvectorExtractorCuda::Read( - const kaldi::OnlineIvectorExtractionConfig &config) { - // read ubm - DiagGmm gmm; - ReadKaldiObject(config.diag_ubm_rxfilename, &gmm); - ubm_gconsts_.Resize(gmm.NumGauss()); - ubm_gconsts_.CopyFromVec(gmm.gconsts()); - ubm_means_inv_vars_.Resize(gmm.NumGauss(), gmm.Dim()); - ubm_means_inv_vars_.CopyFromMat(gmm.means_invvars()); - ubm_inv_vars_.Resize(gmm.NumGauss(), gmm.Dim()); - ubm_inv_vars_.CopyFromMat(gmm.inv_vars()); - num_gauss_ = gmm.NumGauss(); - - // read extractor (copied from ivector/ivector-extractor.cc) - bool binary; - Input input(config.ivector_extractor_rxfilename, &binary); - Matrix w; - Vector w_vec; - std::vector > ie_M; - std::vector > ie_Sigma_inv; - - ExpectToken(input.Stream(), binary, ""); - ExpectToken(input.Stream(), binary, ""); - w.Read(input.Stream(), binary); - ExpectToken(input.Stream(), binary, ""); - w_vec.Read(input.Stream(), binary); - ExpectToken(input.Stream(), binary, ""); - int32 size; - ReadBasicType(input.Stream(), binary, &size); - KALDI_ASSERT(size > 0); - ie_M.resize(size); - for (int32 i = 0; i < size; i++) { - ie_M[i].Read(input.Stream(), binary); - } - ExpectToken(input.Stream(), binary, ""); - ie_Sigma_inv.resize(size); - for (int32 i = 0; i < size; i++) { - ie_Sigma_inv[i].Read(input.Stream(), binary); - } - ExpectToken(input.Stream(), binary, ""); - ReadBasicType(input.Stream(), binary, &prior_offset_); - ExpectToken(input.Stream(), binary, ""); +void BatchedIvectorExtractorCuda::Read() { + + // Pick gmm values + ubm_gconsts_.Resize(info_.diag_ubm.NumGauss()); + ubm_gconsts_.CopyFromVec(info_.diag_ubm.gconsts()); + ubm_means_inv_vars_.Resize(info_.diag_ubm.NumGauss(), info_.diag_ubm.Dim()); + ubm_means_inv_vars_.CopyFromMat(info_.diag_ubm.means_invvars()); + ubm_inv_vars_.Resize(info_.diag_ubm.NumGauss(), info_.diag_ubm.Dim()); + ubm_inv_vars_.CopyFromMat(info_.diag_ubm.inv_vars()); + num_gauss_ = info_.diag_ubm.NumGauss(); + + // Pick and recompute values + const std::vector > &ie_M = info_.extractor.M_; + const std::vector > &ie_Sigma_inv = info_.extractor.Sigma_inv_; + prior_offset_ = info_.extractor.prior_offset_; // compute derived variables ivector_dim_ = ie_M[0].NumCols(); lda_dim_ = ie_M[0].NumRows(); ie_Sigma_inv_M_f_.Resize(num_gauss_ * lda_dim_, ivector_dim_, kUndefined); - ie_U_.Resize(num_gauss_, ivector_dim_ * (ivector_dim_ + 1) / 2); - SpMatrix tmp_sub_U(ivector_dim_); - Matrix tmp_Sigma_inv_M(lda_dim_, ivector_dim_); + SpMatrix tmp_sub_U(ivector_dim_); + Matrix tmp_Sigma_inv_M(lda_dim_, ivector_dim_); for (int32 i = 0; i < num_gauss_; i++) { // compute matrix ie_Sigma_inv_M[i] tmp_sub_U.AddMat2Sp(1, ie_M[i], kTrans, ie_Sigma_inv[i], 0); - SubVector tmp_U_vec(tmp_sub_U.Data(), + SubVector tmp_U_vec(tmp_sub_U.Data(), ivector_dim_ * (ivector_dim_ + 1) / 2); ie_U_.Row(i).CopyFromVec(tmp_U_vec); diff --git a/src/cudafeat/feature-online-batched-ivector-cuda.h b/src/cudafeat/feature-online-batched-ivector-cuda.h index edb8bfe9206..48310184fb7 100644 --- a/src/cudafeat/feature-online-batched-ivector-cuda.h +++ b/src/cudafeat/feature-online-batched-ivector-cuda.h @@ -29,7 +29,7 @@ namespace kaldi { class BatchedIvectorExtractorCuda { public: - BatchedIvectorExtractorCuda(const OnlineIvectorExtractionConfig &config, + BatchedIvectorExtractorCuda(const OnlineIvectorExtractionInfo &info, int32_t feat_dim, int32_t chunk_size, int32_t num_lanes, int32_t num_channels); 
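With this change BatchedIvectorExtractorCuda no longer re-reads the UBM and extractor from disk; Read() copies them out of a caller-owned OnlineIvectorExtractionInfo that the object now borrows by reference. The ownership pattern, reduced to a sketch with placeholder types (these are not the actual Kaldi classes):

    // The component stores a reference, so the info object must outlive it.
    struct IvectorInfo {
      // UBM and extractor parameters, loaded once by the caller.
    };

    class GpuIvectorExtractor {
     public:
      explicit GpuIvectorExtractor(const IvectorInfo &info) : info_(info) {}
     private:
      const IvectorInfo &info_;  // borrowed, never owned or copied
    };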
@@ -64,12 +64,12 @@ class BatchedIvectorExtractorCuda { int32 NumGauss() const { return num_gauss_; } private: - OnlineIvectorExtractionInfo info_; + const OnlineIvectorExtractionInfo &info_; BatchedIvectorExtractorCuda(BatchedIvectorExtractorCuda const &); BatchedIvectorExtractorCuda &operator=(BatchedIvectorExtractorCuda const &); - void Read(const kaldi::OnlineIvectorExtractionConfig &config); + void Read(); void InitializeChannels(const LaneDesc *lanes, int32_t num_lanes); diff --git a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu index bc06ea32d69..c43adaccc2e 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu +++ b/src/cudafeat/feature-online-batched-spectral-cuda-kernels.cu @@ -17,16 +17,8 @@ #include "cudafeat/feature-online-batched-spectral-cuda-kernels.h" -#ifdef __IS_HIP_COMPILE__ -#include - -#include - -#include "hipify.h" -#else #include -#include -#endif +#include #include "cudafeat/lane-desc.h" #include "cudamatrix/cu-rand.h" @@ -70,7 +62,7 @@ __global__ void batched_mel_banks_compute_kernel( // perfom local sum float sum = 0; if (frame < num_frames) { // exclude frames beyond the end - for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { + for (int idx = tid; idx < size; idx += 32) { sum += v[idx] * w[idx]; } } @@ -489,7 +481,7 @@ void cuda_mel_banks_compute(const LaneDesc *lanes, int32_t num_lanes, float energy_floor, int32 *offsets, int32 *sizes, float **vecs, const float *feats, int32_t ldf, float *mels, int32_t ldm, bool use_log) { - dim3 Bl(GPU_WARP_SIZE, 8); + dim3 Bl(32, 8); dim3 Gr(num_bins, (max_chunk_frames + Bl.y - 1) / Bl.y, num_lanes); batched_mel_banks_compute_kernel<<>>( lanes, num_lanes, max_chunk_frames, energy_floor, offsets, sizes, vecs, diff --git a/src/cudafeat/feature-online-batched-spectral-cuda.h b/src/cudafeat/feature-online-batched-spectral-cuda.h index d18f5237e8f..e4549c7177c 100644 --- a/src/cudafeat/feature-online-batched-spectral-cuda.h +++ b/src/cudafeat/feature-online-batched-spectral-cuda.h @@ -19,14 +19,8 @@ #define KALDI_CUDAFEAT_FEATURE_BATCHED_SPECTRAL_CUDA_H_ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else #include #endif -#endif #include "cudafeat/feature-spectral-cuda.h" #include "cudafeat/feature-window-cuda.h" diff --git a/src/cudafeat/feature-online-cmvn-cuda.cu b/src/cudafeat/feature-online-cmvn-cuda.cu index e432fe56573..ba13b4fe484 100644 --- a/src/cudafeat/feature-online-cmvn-cuda.cu +++ b/src/cudafeat/feature-online-cmvn-cuda.cu @@ -15,21 +15,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef __IS_HIP_COMPILE__ -#define __CUDA_ARCH__ 800 -#include - -#include "hipify.h" -#else #include -#endif - #include "cudafeat/feature-online-cmvn-cuda.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-vector.h" -// HIP builds do not required packed floating point operators definition. 
-#ifndef __IS_HIP_COMPILE__ __host__ __device__ inline float2 operator-(const float2 &a, const float2 &b) { float2 retval; retval.x = a.x - b.x; @@ -42,7 +32,6 @@ __host__ __device__ inline float2 operator+(const float2 &a, const float2 &b) { retval.y = a.y + b.y; return retval; } -#endif #if __CUDA_ARCH__ == 750 __launch_bounds__ (1024, 1) @@ -190,9 +179,8 @@ void CudaOnlineCmvn::ComputeFeatures(const CuMatrixBase &feats_in, stats.Stride()); CU_SAFE_CALL(cudaGetLastError()); - threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * - GPU_MAX_WARPS_PER_BLOCK; // round up to GPU_WARP_SIZE threads - if (threads > GPU_MAX_THREADS_PER_BLOCK) threads = GPU_MAX_THREADS_PER_BLOCK; + threads = (feat_dim + 31) / 32 * 32; // round up to 32 threads + if (threads > 1024) threads = 1024; const CuMatrix &gstats = cmvn_state_.global_cmvn_stats; const CuMatrix &sstats = cmvn_state_.speaker_cmvn_stats; diff --git a/src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-spectral-cuda.cu index 7b514010562..3912661c4fd 100644 --- a/src/cudafeat/feature-spectral-cuda.cu +++ b/src/cudafeat/feature-spectral-cuda.cu @@ -17,16 +17,8 @@ #include "cudafeat/feature-spectral-cuda.h" -#ifdef __IS_HIP_COMPILE__ -#include - -#include - -#include "hipify.h" -#else -#include +#include #include -#endif #include "cudamatrix/cu-rand.h" @@ -136,7 +128,7 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor, // perfom local sum float sum = 0; - for (int idx = tid; idx < size; idx += GPU_WARP_SIZE) { + for (int idx = tid; idx < size; idx += 32) { sum += v[idx] * w[idx]; } @@ -495,7 +487,7 @@ void CudaSpectralFeatures::ComputeFinalFeatures(int num_frames, BaseFloat vtln_w // mel banks int num_bins = bin_size_; cu_mel_energies_.Resize(num_frames, num_bins, kUndefined); - dim3 mel_threads(GPU_WARP_SIZE, 8); + dim3 mel_threads(32, 8); dim3 mel_blocks(num_bins, (num_frames + mel_threads.y - 1) / mel_threads.y); mel_banks_compute_kernel<<>>( num_frames, std::numeric_limits::epsilon(), offsets_, sizes_, diff --git a/src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-spectral-cuda.h index b0e4a24c8d2..8683372098c 100644 --- a/src/cudafeat/feature-spectral-cuda.h +++ b/src/cudafeat/feature-spectral-cuda.h @@ -19,14 +19,8 @@ #define KALDI_CUDAFEAT_FEATURE_MFCC_CUDA_H_ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else #include #endif -#endif #include "cudafeat/feature-window-cuda.h" #include "cudamatrix/cu-matrix.h" diff --git a/src/cudafeat/feature-window-cuda.cu b/src/cudafeat/feature-window-cuda.cu index e001eb0790f..b8db5bd46d3 100644 --- a/src/cudafeat/feature-window-cuda.cu +++ b/src/cudafeat/feature-window-cuda.cu @@ -17,13 +17,7 @@ #include "cudafeat/feature-window-cuda.h" -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else -#include -#endif +#include #include "matrix/matrix-functions.h" diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.cc b/src/cudafeat/online-batched-feature-pipeline-cuda.cc index e03fda01ca7..06819f34f43 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.cc +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.cc @@ -20,20 +20,14 @@ #include "cudafeat/online-batched-feature-pipeline-cuda.h" -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else -#include -#endif +#include namespace kaldi { OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( - const OnlineNnet2FeaturePipelineConfig &config, + const OnlineNnet2FeaturePipelineInfo &info, int32_t 
max_chunk_size_samples, int32_t max_lanes, int32_t num_channels) - : info_(config), + : info_(info), cmvn_(NULL), max_chunk_size_samples_(max_chunk_size_samples), max_lanes_(max_lanes), @@ -87,12 +81,7 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( } if (info_.use_ivectors) { - OnlineIvectorExtractionConfig ivector_extraction_opts; - ReadConfigFromFile(config.ivector_extraction_config, - &ivector_extraction_opts); - info_.ivector_extractor_info.Init(ivector_extraction_opts); - - ivector_ = new BatchedIvectorExtractorCuda(ivector_extraction_opts, + ivector_ = new BatchedIvectorExtractorCuda(info_.ivector_extractor_info, FeatureDim(), max_chunk_size_frames_, max_lanes_, num_channels_); @@ -101,8 +90,7 @@ OnlineBatchedFeaturePipelineCuda::OnlineBatchedFeaturePipelineCuda( current_samples_stash_ = new int32_t[num_channels_]; // allocated pinned memory for storing channel desc - CU_SAFE_CALL( - cudaMallocHost((void **)&h_lanes_, sizeof(LaneDesc) * max_lanes_)); + CU_SAFE_CALL(cudaMallocHost(&h_lanes_, sizeof(LaneDesc) * max_lanes_)); // allocate device memory lanes_ = diff --git a/src/cudafeat/online-batched-feature-pipeline-cuda.h b/src/cudafeat/online-batched-feature-pipeline-cuda.h index 6c588c40c24..57971bedb8f 100644 --- a/src/cudafeat/online-batched-feature-pipeline-cuda.h +++ b/src/cudafeat/online-batched-feature-pipeline-cuda.h @@ -23,10 +23,6 @@ #include #include -#ifdef __IS_HIP_COMPILE__ -#include "hipify.h" -#endif - #include "base/kaldi-error.h" #include "feat/feature-window.h" #include "matrix/matrix-lib.h" @@ -43,8 +39,9 @@ namespace kaldi { class OnlineBatchedFeaturePipelineCuda { public: + explicit OnlineBatchedFeaturePipelineCuda( - const OnlineNnet2FeaturePipelineConfig &config, int32_t max_chunk_size, + const OnlineNnet2FeaturePipelineInfo &feature_info, int32_t max_chunk_size, int32_t max_lanes, int32_t num_channels); // Computes features and ivectors for a batched chunk of audio data. 
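[Editor's note] The signature changes above swap config objects for pre-built info objects throughout the pipeline classes, and each class now stores a const reference instead of its own copy. A minimal sketch of the resulting call pattern (hypothetical driver code; everything except the pipeline classes and their constructors, which are shown in this patch, is illustrative):

```cpp
#include "cudafeat/online-batched-feature-pipeline-cuda.h"
#include "cudafeat/online-cuda-feature-pipeline.h"
#include "online2/online-nnet2-feature-pipeline.h"

void BuildPipelines(const kaldi::OnlineNnet2FeaturePipelineConfig &config) {
  using namespace kaldi;
  // Reads the feature config, i-vector extractor, UBM, etc. exactly once.
  OnlineNnet2FeaturePipelineInfo info(config);
  // Both pipelines now hold `const OnlineNnet2FeaturePipelineInfo &info_`,
  // so `info` must outlive them.
  OnlineCudaFeaturePipeline single_stream(info);
  OnlineBatchedFeaturePipelineCuda batched(info,
                                           /*max_chunk_size_samples=*/8000,
                                           /*max_lanes=*/10,
                                           /*num_channels=*/200);
}
```

The payoff is that many pipeline instances can share one set of loaded models instead of each re-reading the UBM and extractor from disk, which the removed ReadConfigFromFile/Init calls in the constructor hunks above used to do per object.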
@@ -111,7 +108,7 @@ class OnlineBatchedFeaturePipelineCuda { const FrameExtractionOptions &GetFrameOptions() { return frame_opts_; } private: - OnlineNnet2FeaturePipelineInfo info_; + const OnlineNnet2FeaturePipelineInfo &info_; CudaOnlineBatchedSpectralFeatures *spectral_feat_; CudaOnlineBatchedCmvn *cmvn_; diff --git a/src/cudafeat/online-cuda-feature-pipeline.cc b/src/cudafeat/online-cuda-feature-pipeline.cc index 58563bba99f..8da8ff75614 100644 --- a/src/cudafeat/online-cuda-feature-pipeline.cc +++ b/src/cudafeat/online-cuda-feature-pipeline.cc @@ -22,8 +22,8 @@ namespace kaldi { OnlineCudaFeaturePipeline::OnlineCudaFeaturePipeline( - const OnlineNnet2FeaturePipelineConfig &config) - : info_(config), spectral_feat(NULL), ivector(NULL) { + const OnlineNnet2FeaturePipelineInfo &info) + : info_(info), spectral_feat(NULL), ivector(NULL) { spectral_feat = NULL; cmvn = NULL; ivector = NULL; @@ -44,16 +44,7 @@ OnlineCudaFeaturePipeline::OnlineCudaFeaturePipeline( } if (info_.use_ivectors) { - OnlineIvectorExtractionConfig ivector_extraction_opts; - ReadConfigFromFile(config.ivector_extraction_config, - &ivector_extraction_opts); - info_.ivector_extractor_info.Init(ivector_extraction_opts); - - // Only these ivector options are currently supported - ivector_extraction_opts.use_most_recent_ivector = true; - ivector_extraction_opts.greedy_ivector_extractor = true; - - ivector = new IvectorExtractorFastCuda(ivector_extraction_opts); + ivector = new IvectorExtractorFastCuda(info_.ivector_extractor_info); } } diff --git a/src/cudafeat/online-cuda-feature-pipeline.h b/src/cudafeat/online-cuda-feature-pipeline.h index f3d2795e3fb..2f9ac4cc688 100644 --- a/src/cudafeat/online-cuda-feature-pipeline.h +++ b/src/cudafeat/online-cuda-feature-pipeline.h @@ -36,7 +36,7 @@ namespace kaldi { class OnlineCudaFeaturePipeline { public: explicit OnlineCudaFeaturePipeline( - const OnlineNnet2FeaturePipelineConfig &config); + const OnlineNnet2FeaturePipelineInfo &info); void ComputeFeatures(const CuVectorBase &cu_wave, BaseFloat sample_freq, @@ -46,7 +46,7 @@ class OnlineCudaFeaturePipeline { ~OnlineCudaFeaturePipeline(); private: - OnlineNnet2FeaturePipelineInfo info_; + const OnlineNnet2FeaturePipelineInfo &info_; CudaSpectralFeatures *spectral_feat; CudaOnlineCmvn *cmvn; IvectorExtractorFastCuda *ivector; diff --git a/src/cudafeat/online-ivector-feature-cuda-kernels.cu b/src/cudafeat/online-ivector-feature-cuda-kernels.cu index b7128dec7e6..12d9b071f59 100644 --- a/src/cudafeat/online-ivector-feature-cuda-kernels.cu +++ b/src/cudafeat/online-ivector-feature-cuda-kernels.cu @@ -15,32 +15,22 @@ // See the License for the specific language governing permissions and // limitations under the License. 
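[Editor's note] The i-vector kernels edited in the next hunks are built around cub::WarpReduce with one temp-storage slot per warp of a 32x32 thread block. A standalone sketch of that primitive (illustrative kernel under that blockDim assumption, not code from the patch):

```cuda
#include <cub/cub.cuh>

// One warp per matrix row; blockDim assumed to be (32, 32).
__global__ void row_sums_kernel(const float *in, int rows, int cols,
                                float *out) {
  typedef cub::WarpReduce<float> WarpReduce;
  // One TempStorage slot per warp, as in batched_gemv_reduce_kernel.
  __shared__ typename WarpReduce::TempStorage temp_storage[32];

  const int wid = threadIdx.y;  // warp id within the block
  const int row = blockIdx.x * blockDim.y + wid;
  if (row >= rows) return;      // the whole warp exits together

  float partial = 0.0f;
  for (int c = threadIdx.x; c < cols; c += 32)  // warp-strided column loop
    partial += in[row * cols + c];

  // Combine the 32 lane partials; the result is valid in lane 0 only.
  float sum = WarpReduce(temp_storage[wid]).Sum(partial);
  if (threadIdx.x == 0) out[row] = sum;
}
```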
-#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else #include -#endif - #include "cudafeat/online-ivector-feature-cuda-kernels.h" #include "cudamatrix/cu-common.h" namespace kaldi { -// Meant to be called with blockDim = GPU_WARP_SIZE x GPU_MAX_WARPS_PER_BLOCK +// Meant to be called with blockDim= 32x32 __global__ void batched_gemv_reduce_kernel(int rows, int cols, const float* __restrict__ A, int lda, const float* __restrict__ X, int ldx, float* C) { // Specialize WarpReduce for type float typedef cub::WarpReduce WarpReduce; - // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps - __shared__ - typename WarpReduce::TempStorage temp_storage[GPU_MAX_WARPS_PER_BLOCK]; + // Allocate WarpReduce shared memory for 32 warps + __shared__ typename WarpReduce::TempStorage temp_storage[32]; - __shared__ float - s_A[GPU_MAX_WARPS_PER_BLOCK] - [GPU_WARP_SIZE + 1]; //+1 to avoid bank conflicts on transpose + __shared__ float s_A[32][32 + 1]; //+1 to avoid bank conflicts on transpose int bid = blockIdx.x; // batch id int tid = threadIdx.x; // thread id @@ -51,15 +41,13 @@ __global__ void batched_gemv_reduce_kernel(int rows, int cols, // Offset to input vector to starting column for batch const float* __restrict__ X_in = X + bid * ldx; - for (int i = 0; i < cols; - i += GPU_WARP_SIZE) { // threadIdx.x, keep all threads present + for (int i = 0; i < cols; i += 32) { // threadIdx.x, keep all threads present int c = i + tid; float sum = 0.0f; // Perform dot product for (int j = 0; j < rows; - j += - GPU_MAX_WARPS_PER_BLOCK) { // threadIdx.y, keep all threads present + j += 32) { // threadIdx.y, keep all threads present int r = j + wid; float val = 0.0f; @@ -145,11 +133,9 @@ __global__ void get_matrix_sum_double_buffer_kernel(int32_t b, int32_t num_rows, int32_t lda, float scale, float* retval) { // Specialize WarpReduce for type float - typedef cub::BlockReduce + typedef cub::BlockReduce BlockReduce; - // Allocate WarpReduce shared memory for GPU_MAX_WARPS_PER_BLOCK warps + // Allocate WarpReduce shared memory for 32 warps __shared__ typename BlockReduce::TempStorage temp_storage; float sum = 0.0f; @@ -215,8 +201,7 @@ __global__ void update_linear_and_quadratic_terms_kernel( void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, const float* AT, int B_stride, const float* B, float* C) { - batched_gemv_reduce_kernel<<>>( + batched_gemv_reduce_kernel<<>>( rows, cols, AT, A_stride, B, B_stride, C); CU_SAFE_CALL(cudaGetLastError()); } @@ -224,11 +209,8 @@ void batched_gemv_reduce(int batch_size, int rows, int cols, int A_stride, void splice_features(int32_t num_frames, int32_t feat_dim, int32_t left, int32_t size, const float* feats, int32_t ldf, float* sfeats, int32_t lds) { - int threads = (feat_dim + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE * - GPU_MAX_WARPS_PER_BLOCK; // round up to the nearest warp size - if (threads > GPU_MAX_THREADS_PER_BLOCK) - threads = GPU_MAX_THREADS_PER_BLOCK; // Max block size is - // GPU_MAX_THREADS_PER_BLOCK threads + int threads = (feat_dim + 31) / 32 * 32; // round up to the nearest warp size + if (threads > 1024) threads = 1024; // Max block size is 1024 threads splice_features_kernel<<>>( num_frames, feat_dim, left, size, feats, ldf, sfeats, lds); @@ -250,7 +232,7 @@ void update_linear_and_quadratic_terms(int32_t n, float old_num_frames, void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, float* A, int32_t lda, float scale, float* sum) { - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 
threads(32, 32); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); @@ -261,7 +243,7 @@ void get_matrix_sum_double_buffer(int32_t b, int32_t num_rows, int32_t num_cols, void square_matrix(int32_t num_rows, int32_t num_cols, const float* feats, int32_t ldf, float* feats_sq, int32_t lds) { - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(32, 32); dim3 blocks((num_cols + threads.x - 1) / threads.x, (num_rows + threads.y - 1) / threads.y); diff --git a/src/cudafeat/online-ivector-feature-cuda.cc b/src/cudafeat/online-ivector-feature-cuda.cc index daf1c7dfbf9..287d0ab470e 100644 --- a/src/cudafeat/online-ivector-feature-cuda.cc +++ b/src/cudafeat/online-ivector-feature-cuda.cc @@ -16,20 +16,8 @@ // limitations under the License. #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -// The BLAS enumerators are used instead of the SOLVER ones. -#ifdef CUBLAS_FILL_MODE_LOWER -#undef CUBLAS_FILL_MODE_LOWER -#endif -#define CUBLAS_FILL_MODE_LOWER HIPSOLVER_FILL_MODE_LOWER -#else -#include -#endif +#include #endif - #include #include "base/io-funcs.h" @@ -132,48 +120,20 @@ void IvectorExtractorFastCuda::GetIvector(const CuMatrixBase &feats, nvtxRangePop(); } -void IvectorExtractorFastCuda::Read( - const kaldi::OnlineIvectorExtractionConfig &config) { +void IvectorExtractorFastCuda::Read() { // read ubm - DiagGmm gmm; - ReadKaldiObject(config.diag_ubm_rxfilename, &gmm); - ubm_gconsts_.Resize(gmm.NumGauss()); - ubm_gconsts_.CopyFromVec(gmm.gconsts()); - ubm_means_inv_vars_.Resize(gmm.NumGauss(), gmm.Dim()); - ubm_means_inv_vars_.CopyFromMat(gmm.means_invvars()); - ubm_inv_vars_.Resize(gmm.NumGauss(), gmm.Dim()); - ubm_inv_vars_.CopyFromMat(gmm.inv_vars()); - num_gauss_ = gmm.NumGauss(); - - // read extractor (copied from ivector/ivector-extractor.cc) - bool binary; - Input input(config.ivector_extractor_rxfilename, &binary); - Matrix w; - Vector w_vec; - std::vector > ie_M; - std::vector > ie_Sigma_inv; - - ExpectToken(input.Stream(), binary, ""); - ExpectToken(input.Stream(), binary, ""); - w.Read(input.Stream(), binary); - ExpectToken(input.Stream(), binary, ""); - w_vec.Read(input.Stream(), binary); - ExpectToken(input.Stream(), binary, ""); - int32 size; - ReadBasicType(input.Stream(), binary, &size); - KALDI_ASSERT(size > 0); - ie_M.resize(size); - for (int32 i = 0; i < size; i++) { - ie_M[i].Read(input.Stream(), binary); - } - ExpectToken(input.Stream(), binary, ""); - ie_Sigma_inv.resize(size); - for (int32 i = 0; i < size; i++) { - ie_Sigma_inv[i].Read(input.Stream(), binary); - } - ExpectToken(input.Stream(), binary, ""); - ReadBasicType(input.Stream(), binary, &prior_offset_); - ExpectToken(input.Stream(), binary, ""); + ubm_gconsts_.Resize(info_.diag_ubm.NumGauss()); + ubm_gconsts_.CopyFromVec(info_.diag_ubm.gconsts()); + ubm_means_inv_vars_.Resize(info_.diag_ubm.NumGauss(), info_.diag_ubm.Dim()); + ubm_means_inv_vars_.CopyFromMat(info_.diag_ubm.means_invvars()); + ubm_inv_vars_.Resize(info_.diag_ubm.NumGauss(), info_.diag_ubm.Dim()); + ubm_inv_vars_.CopyFromMat(info_.diag_ubm.inv_vars()); + num_gauss_ = info_.diag_ubm.NumGauss(); + + // Pick and recompute values + const std::vector > &ie_M = info_.extractor.M_; + const std::vector > &ie_Sigma_inv = info_.extractor.Sigma_inv_; + prior_offset_ = info_.extractor.prior_offset_; // compute derived variables ivector_dim_ = ie_M[0].NumCols(); @@ -183,12 +143,12 @@ void IvectorExtractorFastCuda::Read( ie_U_.Resize(num_gauss_, ivector_dim_ * (ivector_dim_ + 1) 
/ 2); - SpMatrix tmp_sub_U(ivector_dim_); - Matrix tmp_Sigma_inv_M(feat_dim_, ivector_dim_); + SpMatrix tmp_sub_U(ivector_dim_); + Matrix tmp_Sigma_inv_M(feat_dim_, ivector_dim_); for (int32 i = 0; i < num_gauss_; i++) { // compute matrix ie_Sigma_inv_M[i[ tmp_sub_U.AddMat2Sp(1, ie_M[i], kTrans, ie_Sigma_inv[i], 0); - SubVector tmp_U_vec(tmp_sub_U.Data(), + SubVector tmp_U_vec(tmp_sub_U.Data(), ivector_dim_ * (ivector_dim_ + 1) / 2); ie_U_.Row(i).CopyFromVec(tmp_U_vec); diff --git a/src/cudafeat/online-ivector-feature-cuda.h b/src/cudafeat/online-ivector-feature-cuda.h index f6fe1e65cb9..62fc95d3110 100644 --- a/src/cudafeat/online-ivector-feature-cuda.h +++ b/src/cudafeat/online-ivector-feature-cuda.h @@ -29,20 +29,19 @@ namespace kaldi { class IvectorExtractorFastCuda { public: - IvectorExtractorFastCuda(const OnlineIvectorExtractionConfig &config) - : b_(0), tot_post_(2) { - if (config.use_most_recent_ivector == false) { + IvectorExtractorFastCuda(const OnlineIvectorExtractionInfo &info) + : info_(info), b_(0), tot_post_(2) { + if (info_.use_most_recent_ivector == false) { KALDI_WARN << "IvectorExractorFastCuda: Ignoring use_most_recent_ivector=false."; } - if (config.greedy_ivector_extractor == false) { + if (info_.greedy_ivector_extractor == false) { KALDI_WARN << "IvectorExractorFastCuda: Ignoring " "greedy_ivector_extractor=false."; } - info_.Init(config); + Read(); naive_cmvn_state_ = OnlineCmvnState(info_.global_cmvn_stats); - Read(config); cu_lda_.Resize(info_.lda_mat.NumRows(), info_.lda_mat.NumCols()); cu_lda_.CopyFromMat(info_.lda_mat); @@ -84,12 +83,12 @@ class IvectorExtractorFastCuda { int32 NumGauss() const { return num_gauss_; } private: - OnlineIvectorExtractionInfo info_; + const OnlineIvectorExtractionInfo &info_; IvectorExtractorFastCuda(IvectorExtractorFastCuda const &); IvectorExtractorFastCuda &operator=(IvectorExtractorFastCuda const &); - void Read(const kaldi::OnlineIvectorExtractionConfig &config); + void Read(); void SpliceFeats(const CuMatrixBase &feats, CuMatrix *spliced_feats); diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile index ed1c413c939..9dbb5d30fa1 100644 --- a/src/cudafeatbin/Makefile +++ b/src/cudafeatbin/Makefile @@ -3,14 +3,12 @@ all: ; EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -ifeq ($(IS_GPU_BUILD), true) +ifeq ($(CUDA), true) ifeq ($(WITH_CUDADECODER), true) # Make sure we have CUDA_ARCH from kaldi.mk, -ifeq ($(CUDA), true) - ifndef CUDA_ARCH - $(error CUDA_ARCH is undefined, run 'src/configure') - endif +ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') endif LDFLAGS += $(CUDA_LDFLAGS) diff --git a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc index 44ef403f21a..24e7cbd4a70 100644 --- a/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc +++ b/src/cudafeatbin/apply-batched-cmvn-online-cuda.cc @@ -18,10 +18,8 @@ // limitations under the License. #if HAVE_CUDA == 1 -#ifndef __IS_HIP_COMPILE__ #include #endif -#endif #include #include diff --git a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc index ff9415b8f11..36cfc4ad90c 100644 --- a/src/cudafeatbin/compute-fbank-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-fbank-online-batched-cuda.cc @@ -16,10 +16,8 @@ // limitations under the License. 
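[Editor's note] Stepping back to the Read() hunk above: each ie_U_ row has length ivector_dim_ * (ivector_dim_ + 1) / 2 because the quadratic term M_i^T Sigma_inv_i M_i is symmetric and is kept in Kaldi's packed lower-triangular SpMatrix layout. The indexing that layout implies, as a hypothetical helper for reference:

```cpp
// Row-major packed lower triangle: row r contributes r + 1 entries,
// so element (r, c) with c <= r lands at offset r * (r + 1) / 2 + c.
inline int PackedLowerIndex(int r, int c) {
  return r * (r + 1) / 2 + c;
}
```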
#if HAVE_CUDA == 1 -#ifndef __IS_HIP_COMPILE__ #include #endif -#endif #include #include diff --git a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc index 3fcc1aea659..99883f3114a 100644 --- a/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc +++ b/src/cudafeatbin/compute-mfcc-online-batched-cuda.cc @@ -16,10 +16,8 @@ // limitations under the License. #if HAVE_CUDA == 1 -#ifndef __IS_HIP_COMPILE__ #include #endif -#endif #include #include diff --git a/src/cudafeatbin/compute-online-feats-batched-cuda.cc b/src/cudafeatbin/compute-online-feats-batched-cuda.cc index e3f2ed75d30..787aceeca0d 100644 --- a/src/cudafeatbin/compute-online-feats-batched-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-batched-cuda.cc @@ -16,10 +16,8 @@ // limitations under the License. #if HAVE_CUDA -#ifndef __IS_HIP_COMPILE__ #include -#include -#endif +#include #endif #include diff --git a/src/cudafeatbin/compute-online-feats-cuda.cc b/src/cudafeatbin/compute-online-feats-cuda.cc index d54ba56be84..b9135c3cee6 100644 --- a/src/cudafeatbin/compute-online-feats-cuda.cc +++ b/src/cudafeatbin/compute-online-feats-cuda.cc @@ -16,9 +16,7 @@ // limitations under the License. #if HAVE_CUDA == 1 -#ifndef __IS_HIP_COMPILE__ -#include -#endif +#include #endif #include "base/kaldi-common.h" #include "util/common-utils.h" diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 45c10b78899..45c2ba44fd7 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -12,7 +12,7 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o -ifeq ($(IS_GPU_BUILD), true) +ifeq ($(CUDA), true) OBJFILES += cu-kernels.o endif @@ -27,15 +27,8 @@ ifeq ($(CUDA), true) endif endif -ifeq ($(CUDA), true) # Implicit rule for kernel compilation, %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ -endif - -ifeq ($(ROCM), true) -%.o : %.cu - $(HIPCC) -c -x hip $< -o $@ $(ROCM_INCLUDE) $(ROCM_FLAGS) $(ROCM_ARCH_FLAGS) -I../ -endif include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index c4cceedca48..e438c604509 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -23,16 +23,9 @@ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include #include #include -#endif #include #include diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 3edd9f1ca40..d7d65da806a 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -23,18 +23,10 @@ #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include -#include - -#include "hipify.h" -#else #include #include #include #endif -#endif #include #include diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index b8c250c6771..53de59fe4fc 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -28,13 +28,7 @@ #include #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else #include -#endif #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-kernels.h" diff --git a/src/cudamatrix/cu-array.cc 
b/src/cudamatrix/cu-array.cc index 2a29338aeb1..53eccdd44c5 100644 --- a/src/cudamatrix/cu-array.cc +++ b/src/cudamatrix/cu-array.cc @@ -22,14 +22,8 @@ #include #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#else #include #endif -#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-array.h b/src/cudamatrix/cu-array.h index 3db44bf4aa5..84f78f00a91 100644 --- a/src/cudamatrix/cu-array.h +++ b/src/cudamatrix/cu-array.h @@ -105,12 +105,13 @@ class CuArrayBase { protected: /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuArrayBase(): data_(NULL), dim_(0) { } + CuArrayBase(): data_(NULL), dim_(0) { } T *data_; ///< GPU data pointer (if GPU not available, ///< will point to CPU memory). MatrixIndexT dim_; ///< dimension of the vector + }; /** @@ -122,21 +123,22 @@ class CuArrayBase { template class CuArray: public CuArrayBase { public: + /// Default constructor, initialized data_ to NULL and dim_ to 0 via /// constructor of CuArrayBase. - CuArray() { } + CuArray() { } /// Constructor with memory initialisation. resize_type may be kSetZero or /// kUndefined. - explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) + explicit CuArray(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero) { Resize(dim, resize_type); } /// Constructor from CPU-based int vector - explicit CuArray(const std::vector &src) { CopyFromVec(src); } + explicit CuArray(const std::vector &src) { CopyFromVec(src); } /// Copy constructor. We don't make this explicit because we want to be able /// to create a std::vector. - CuArray(const CuArray &src) { CopyFromArray(src); } + CuArray(const CuArray &src) { CopyFromArray(src); } /// Destructor ~CuArray() { Destroy(); } @@ -170,6 +172,7 @@ class CuArray: public CuArrayBase { /// I/O void Read(std::istream &is, bool binary); void Write(std::ostream &is, bool binary) const; + }; @@ -179,7 +182,7 @@ class CuSubArray: public CuArrayBase { /// Constructor as a range of an existing CuArray or CuSubArray. Note: like /// similar constructors in class CuVector and others, it can be used to evade /// 'const' constraints; don't do that. 
- explicit CuSubArray(const CuArrayBase &src, + explicit CuSubArray(const CuArrayBase &src, MatrixIndexT offset, MatrixIndexT dim); /// Construct from raw pointers diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 63cf33f98b2..e0c64912207 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -19,16 +19,9 @@ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include #include #endif -#endif #include #include "base/timer.h" diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 938ec679f68..10fc00da681 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -22,15 +22,7 @@ #include "cudamatrix/cu-common.h" -#ifdef __IS_HIP_COMPILE__ -#include - -#include "hipify.h" -#define API_NAME_PREFIX "HIP" -#else #include -#define API_NAME_PREFIX "CU" -#endif #include "base/kaldi-common.h" #include "cudamatrix/cu-matrixdim.h" @@ -39,9 +31,6 @@ namespace kaldi { #ifdef USE_NVTX NvtxTracer::NvtxTracer(const char* name) { -#ifdef __IS_HIP_COMPILE__ - roctxRangePushA(name); -#else const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; const int num_colors = sizeof(colors)/sizeof(uint32_t); int color_id = ((int)name[0])%num_colors; @@ -54,14 +43,9 @@ NvtxTracer::NvtxTracer(const char* name) { eventAttrib.message.ascii = name; nvtxRangePushEx(&eventAttrib); // nvtxRangePushA(name); -#endif } NvtxTracer::~NvtxTracer() { -#ifdef __IS_HIP_COMPILE__ - roctxRangePop(); -#else - nvtxRangePop(); -#endif + nvtxRangePop(); } #endif @@ -103,106 +87,61 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, const char* cublasGetStatusStringK(cublasStatus_t status) { // Defined in CUDA include file: cublas.h or cublas_api.h switch(status) { - case CUBLAS_STATUS_SUCCESS: - return API_NAME_PREFIX "BLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: - return API_NAME_PREFIX "BLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: - return API_NAME_PREFIX "BLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: - return API_NAME_PREFIX "BLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return API_NAME_PREFIX "BLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: - return API_NAME_PREFIX "BLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return API_NAME_PREFIX "BLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return API_NAME_PREFIX "BLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: - return API_NAME_PREFIX "BLAS_STATUS_NOT_SUPPORTED"; - case CUBLAS_STATUS_LICENSE_ERROR: - return API_NAME_PREFIX "BLAS_STATUS_LICENSE_ERROR"; -#ifdef __IS_HIP_COMPILE__ - case HIPBLAS_STATUS_HANDLE_IS_NULLPTR: - return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; - case HIPBLAS_STATUS_INVALID_ENUM: - return API_NAME_PREFIX "BLAS_STATUS_HANDLE_IS_NULLPTR"; -#endif + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return 
"CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; } - return API_NAME_PREFIX "BLAS_STATUS_UNKNOWN_ERROR"; + return "CUBLAS_STATUS_UNKNOWN_ERROR"; } const char* cusparseGetStatusString(cusparseStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/cusparse/index.html#cusparsestatust // Defined in CUDA include file: cusparse.h switch(status) { - case CUSPARSE_STATUS_SUCCESS: - return API_NAME_PREFIX "SPARSE_STATUS_SUCCESS"; - case CUSPARSE_STATUS_NOT_INITIALIZED: - return API_NAME_PREFIX "SPARSE_STATUS_NOT_INITIALIZED"; - case CUSPARSE_STATUS_ALLOC_FAILED: - return API_NAME_PREFIX "SPARSE_STATUS_ALLOC_FAILED"; - case CUSPARSE_STATUS_INVALID_VALUE: - return API_NAME_PREFIX "SPARSE_STATUS_INVALID_VALUE"; - case CUSPARSE_STATUS_ARCH_MISMATCH: - return API_NAME_PREFIX "SPARSE_STATUS_ARCH_MISMATCH"; - case CUSPARSE_STATUS_MAPPING_ERROR: - return API_NAME_PREFIX "SPARSE_STATUS_MAPPING_ERROR"; - case CUSPARSE_STATUS_EXECUTION_FAILED: - return API_NAME_PREFIX "SPARSE_STATUS_EXECUTION_FAILED"; - case CUSPARSE_STATUS_INTERNAL_ERROR: - return API_NAME_PREFIX "SPARSE_STATUS_INTERNAL_ERROR"; - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return API_NAME_PREFIX "SPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSPARSE_STATUS_ZERO_PIVOT: - return API_NAME_PREFIX "SPARSE_STATUS_ZERO_PIVOT"; -#if CUDA_VERSION >= 11000 - case CUSPARSE_STATUS_NOT_SUPPORTED: - return API_NAME_PREFIX "SPARSE_STATUS_NOT_SUPPORTED"; - case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: - return API_NAME_PREFIX "SPARSE_STATUS_INSUFFICIENT_RESOURCES"; -#endif + case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; + #if CUDA_VERSION >= 11000 + case CUSPARSE_STATUS_NOT_SUPPORTED: return "CUSPARSE_STATUS_NOT_SUPPORTED"; + case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES: return "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES"; + #endif } - return API_NAME_PREFIX "SPARSE_STATUS_UNKNOWN_ERROR"; + return "CUSPARSE_STATUS_UNKNOWN_ERROR"; } const char* curandGetStatusString(curandStatus_t status) { // detail info come from http://docs.nvidia.com/cuda/curand/group__HOST.html // Defined in CUDA include file: curand.h switch(status) { - case CURAND_STATUS_SUCCESS: - return API_NAME_PREFIX "RAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return API_NAME_PREFIX "RAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return API_NAME_PREFIX "RAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return API_NAME_PREFIX "RAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return API_NAME_PREFIX "RAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return API_NAME_PREFIX "RAND_STATUS_OUT_OF_RANGE"; - case 
CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return API_NAME_PREFIX "RAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return API_NAME_PREFIX "RAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return API_NAME_PREFIX "RAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return API_NAME_PREFIX "RAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return API_NAME_PREFIX "RAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return API_NAME_PREFIX "RAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return API_NAME_PREFIX "RAND_STATUS_INTERNAL_ERROR"; -#ifdef __IS_HIP_COMPILE__ - case HIPRAND_STATUS_NOT_IMPLEMENTED: - return API_NAME_PREFIX "RAND_STATUS_NOT_IMPLEMENTED"; -#endif + case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; } - return API_NAME_PREFIX "RAND_STATUS_UNKNOWN_ERROR"; + return "CURAND_STATUS_UNKNOWN_ERROR"; } } // namespace kaldi diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index f7f45b8043a..83f8a39a8b9 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -31,25 +31,11 @@ #if HAVE_CUDA -#ifdef __IS_HIP_COMPILE__ -#include -#include -#include -#include -#include - -#include "hipify.h" -#else #include #include #include #include -#include - -#define GPU_WARP_SIZE 32 -#define GPU_MAX_THREADS_PER_BLOCK 1024 -#define GPU_MAX_WARPS_PER_BLOCK (GPU_MAX_THREADS_PER_BLOCK / GPU_WARP_SIZE) -#endif +#include #define CU_SAFE_CALL(fun) \ { \ diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index bb4017de9bb..be02921169d 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -19,16 +19,9 @@ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include #include #endif -#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index fd2c0c64f1f..39bcf373ace 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -23,17 +23,10 @@ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include -#include - -#include "hipify.h" -#else #include #include #include -#endif // __IS_HIP_COMPILE__ + #include #include #include @@ -247,12 +240,8 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Suggest to use compute exclusive mode -#ifdef __IS_HIP_COMPILE__ - KALDI_WARN << "Not in compute-exclusive mode."; -#else 
KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; -#endif // We want to choose the device more carefully, so release the CUDA context. e = cudaDeviceReset(); if (e != cudaSuccess) { diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index fe8ac795560..2f278eb85b9 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -28,27 +28,14 @@ #include #include -#ifdef __IS_HIP_COMPILE__ -#include -#include -#include -#include -#include - -#include "hipify.h" -#else #include #include #include #include #include -#endif + #if CUDA_VERSION >= 9010 -#ifdef __IS_HIP_COMPILE__ -#include -#else #include -#endif #else // cusolver not supported. // Setting a few types to minimize compiler guards. diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index b3c3165bd96..8044ff699bc 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -27,23 +27,11 @@ #include #include -#ifdef __IS_HIP_COMPILE__ -#define __CUDA_ARCH__ 800 -#include -#include - -#include -#include - -#include "cudamatrix/cu-kernels-ansi.h" -#include "hipify.h" -#else #include -#include "cudamatrix/cu-common.h" #include "cudamatrix/cu-kernels-ansi.h" #include #include // for CUDA_VERSION -#endif //__IS_HIP_COMPILE__ + /*********************************************************************** * Generic __device__ functions @@ -965,12 +953,11 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, } // Warp reduce. Implicitly synchronized within a warp. + if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - if (tid < warpSize) { + for (int shift = warpSize; shift > 0; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; } - __syncwarp(); } // output 1 sum per thread block @@ -1122,8 +1109,8 @@ void trace_mat_mat_trans_atomic(Real *d_result, cudaStream_t stream) { // Assuming *d_result is set to zero already - constexpr int THREADS_X = GPU_WARP_SIZE; - constexpr int THREADS_Y = GPU_MAX_WARPS_PER_BLOCK / 2; + constexpr int THREADS_X = 32; + constexpr int THREADS_Y = 16; dim3 thrds(THREADS_X, THREADS_Y); @@ -1180,7 +1167,6 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { ssum[tid] += ssum[tid + shift]; - __syncwarp(); } } @@ -1220,12 +1206,11 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, } // Warp reduce to 1 element. Threads implicitly synchronized within a warp. + if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - if (tid < warpSize) { - ssum[tid] += ssum[tid + shift]; - } - __syncwarp(); + for (int shift = warpSize; shift > 0; shift >>= 1) { + ssum[tid] += ssum[tid + shift]; + } } // output 1 sum per thread block @@ -1272,13 +1257,12 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, // Warp reduce to 1 element per column. // Threads implicitly synchronized within a warp. + if (tid < warpSize) { # pragma unroll for (int shift = warpSize; shift >= TileDim; shift >>= 1) { - if (tid < warpSize) { - ssum[tid] += ssum[tid + shift]; - } - __syncwarp(); + ssum[tid] += ssum[tid + shift]; } + } // output TileDim sums per thread block if (tid < TileDim) { @@ -1356,13 +1340,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M, // Warp reduce to 1 element per column. // Threads implicitly synchronized within a warp. 
+ if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift >= TileDim; shift >>= 1) { - if (tid < warpSize) { + for (int shift = warpSize; shift >= TileDim; shift >>= 1) { smem.sum[tid] += smem.sum[tid + shift]; } - __syncwarp(); } + // output TileDim sums per thread block if (tid < TileDim && j_n < dim_N.cols) { v[j_n] = alpha * smem.sum[tid] + beta * v[j_n]; @@ -1809,11 +1793,10 @@ static void _vec_transform_reduce( } // Reduce last warp. Threads implicitly synchronized within a warp. - for (int shift = warpSize; shift > 0; shift >>= 1) { - if (tid < warpSize) { + if (tid < warpSize) { + for (int shift = warpSize; shift > 0; shift >>= 1) { sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); } - __syncwarp(); } // Output to vector result. @@ -2023,11 +2006,9 @@ static void _transform_reduce_mat_rows( } // Reduce last warp. Threads implicitly synchronized within a warp. - for (int shift = warpSize; shift > 0; shift >>= 1) { - if (tid < warpSize) { + if (tid < warpSize) { + for (int shift = warpSize; shift > 0; shift >>= 1) sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); - } - __syncwarp(); } // Output to vector result. @@ -2064,13 +2045,11 @@ static void _transform_reduce_mat_cols( } // Reduce last warp. Threads implicitly synchronized within a warp. - for (int shift = warpSize; shift > 0; shift >>= 1) { - if (tid < warpSize) { + if (tid < warpSize) { + for (int shift = warpSize; shift > 0; shift >>= 1) sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]); - } - __syncwarp(); } - + // Output to vector result. if (tid == 0) { result[i] = op.PostReduce(sdata[0], result[i]); @@ -2108,12 +2087,13 @@ static void _group_transform_reduce( x_idx += threads_per_group; } sreduction[tid] = treduction; - __syncthreads(); + if (threads_per_group > warpSize) { + __syncthreads(); + } // tree-reduce to 2x warpSize elements per group - int shift = threads_per_group / 2; -#pragma unroll - for (; shift > warpSize; shift >>= 1) { +# pragma unroll + for (int shift = threads_per_group / 2; shift > warpSize; shift >>= 1) { if (threadIdx.x < shift) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } @@ -2121,12 +2101,14 @@ static void _group_transform_reduce( } // Warp-reduce to 1 element per group. + // Threads implicitly synchronized within the warp. + const int warp_reduce_size = + threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize; + if (threadIdx.x < warp_reduce_size) { # pragma unroll - for (; shift > 0; shift >>= 1) { - if (threadIdx.x < shift) { + for (int shift = warp_reduce_size; shift > 0; shift >>= 1) { sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]); } - __syncwarp(); } // Store the result. 
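[Editor's note] The cu-kernels.cu hunks above all apply the same restructuring: the `tid < warpSize` guard moves outside the unrolled shift loop and the per-iteration __syncwarp() calls disappear, reverting to the classic warp-synchronous tail reduction. In isolation the pattern looks like the sketch below (illustrative, not from the patch). One caveat worth keeping in mind: the idiom traditionally relies on a `volatile` shared-memory view to stop the compiler from caching reads, and on Volta-and-later parts with independent thread scheduling, __syncwarp() is the officially supported way to get the same guarantee.

```cuda
// Tail of a block reduction over sdata[0 .. 2*warpSize - 1]:
// one warp finishes the job with no explicit synchronization.
__device__ void warp_reduce_tail(volatile float *sdata, int tid) {
  if (tid < warpSize) {  // guard hoisted out of the loop
# pragma unroll
    for (int shift = warpSize; shift > 0; shift >>= 1) {
      sdata[tid] += sdata[tid + shift];
    }
  }
}
```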
@@ -2985,13 +2967,12 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv, } // reduce to 1 element per row + if (tid < warpSize) { # pragma unroll - for (int shift = warpSize; shift > 0; shift >>= 1) { - if (tid < warpSize) { + for (int shift = warpSize; shift > 0; shift >>= 1) { sprod[tid] += sprod[tid + shift]; snorm[tid] += snorm[tid + shift]; } - __syncwarp(); } // broadcast the sum results @@ -3273,16 +3254,15 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, } // Warp reduce without __syncthreads() // (note.: synchronizes implicitly within a warp at the multiprocessor) + if (tid < warpSize / 2) { #pragma unroll - for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; - num_working_threads >>= 1) { - if (tid < warpSize / 2) { + for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; + num_working_threads >>= 1) { if (smax[tid + num_working_threads] > smax[tid]) { smax[tid] = smax[tid + num_working_threads]; sidx[tid] = sidx[tid + num_working_threads]; } } - __syncwarp(); } if (tid == 0) { @@ -4010,9 +3990,9 @@ struct BatchedMatrixCopyDesc { MatrixCopyDesc batch[MAX_BATCH_SIZE]; }; -// launched with a block size of GPU_MAX_WARPS_PER_BLOCKxGPU_WARP_SIZE -// (GPU_MAX_WARPS_PER_BLOCK rows, GPU_WARP_SIZE cols per CTA) grid dim x,y -// expands to fill out average in x/y across batches grid dim.z is batch +// launched with a block size of 32x32 (32 rows, 32 cols per CTA) +// grid dim x,y expands to fill out average in x/y across batches +// grid dim.z is batch template __global__ void _cuda_batch_copy_mats(BatchedMatrixCopyDesc batch_desc) { @@ -4391,7 +4371,7 @@ void cudaF_trace_mat_mat_trans(const float* A, const float* B, void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat<<>>(A, B, dA, B_stride, value); + _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); } void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, @@ -4412,11 +4392,6 @@ void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); -#ifdef __IS_HIP_COMPILE__ - } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> - <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); -#endif } } @@ -4427,11 +4402,7 @@ void cudaF_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const float alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); -#ifdef __IS_HIP_COMPILE__ - } else if (Bl.x == 64) { - _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); -#endif + _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); } } @@ -5106,7 +5077,7 @@ void cudaD_trace_mat_mat_trans(const double* A, void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat<<>>(A, B, dA, B_stride, value); + _trace_mat_mat<32> <<>>(A,B,dA,B_stride,value); } void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, @@ -5127,11 +5098,6 @@ void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, } else if (Bl.x == 32) { _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, v, stride_v); -#ifdef __IS_HIP_COMPILE__ - } else if (Bl.x == 64) { - _add_diag_mat_mat_MTN<64> - <<>>(alpha, M, stride_M, N, dim_N, beta, v, 
stride_v); -#endif } } @@ -5142,11 +5108,7 @@ void cudaD_add_diag_mat_mat_MN(dim3 Gr, dim3 Bl, const double alpha, if (Bl.x == 16) { _add_diag_mat_mat_MN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); } else if (Bl.x==32) { - _add_diag_mat_mat_MN<32><<>>(alpha, M, stride_M, N, dim_N, beta, v); -#ifdef __IS_HIP_COMPILE__ - } else if (Bl.x == 64) { - _add_diag_mat_mat_MN<64><<>>(alpha, M, stride_M, N, dim_N, beta, v); -#endif + _add_diag_mat_mat_MN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); } } @@ -5517,25 +5479,25 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); + _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); + _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); + _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out, mat_in, d_out, d_in); + _copy_from_mat_trans<32> <<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat, MatrixDim mat_dim, @@ -5831,15 +5793,7 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, // Launches a kernel that does nothing, explicitly using the legacy default stream; // this will synchronize all threads without blocking. void cuda_legacy_noop() { -#ifdef __IS_HIP_COMPILE__ - // HIP doesn't currently support cudaStreamLegacy stream so we force the - // implementation to use the legacy (not per-thread) API to get similar - // semantics. 
- auto k = reinterpret_cast(_noop_kernel); - hipExtLaunchKernel(k, dim3(1), dim3(1), nullptr, 0, 0, 0, 0, 0); -#else _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>(); -#endif } void cudaF_mat_copy_range_clamped( @@ -5849,10 +5803,8 @@ void cudaF_mat_copy_range_clamped( float *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks( - (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(32,32); + dim3 blocks((num_cols+31)/32,(num_rows+31)/32); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5865,10 +5817,8 @@ void cudaD_mat_copy_range_clamped( double *dst, int32_t ldd) { int32_t num_rows = row_end - row_start; - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); - dim3 blocks( - (num_cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (num_rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK); + dim3 threads(32,32); + dim3 blocks((num_cols+31)/32,(num_rows+31)/32); _cuda_mat_copy_range_clamped<<>>(row_start, row_end, num_cols, src, lds, clamp_low, clamp_high, dst, ldd); @@ -5877,7 +5827,8 @@ void cudaD_mat_copy_range_clamped( void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs, int32_t *ldo) { - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + + dim3 threads(32,32); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5903,10 +5854,9 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks( - (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks((cols + 31) / 32, + (rows + 31) / 32, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5926,11 +5876,10 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - - dim3 blocks( - (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); + + dim3 blocks((cols + 31) / 32, + (rows + 31) / 32, + remaining); // no memcpy needed here. 
Memory will be passed down directly // through paramter passing and live in constant memory @@ -5943,7 +5892,8 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows, void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs, int32_t *ldo) { - dim3 threads(GPU_WARP_SIZE, GPU_MAX_WARPS_PER_BLOCK); + + dim3 threads(32,32); int32_t total_rows=0, total_cols=0; BatchedMatrixCopyDesc batch_desc; @@ -5969,10 +5919,9 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, // compute average number of rows/cols across batch int32_t rows = ceilf(total_rows / (float)MAX_BATCH_SIZE); int32_t cols = ceilf(total_cols / (float)MAX_BATCH_SIZE); - dim3 blocks( - (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - MAX_BATCH_SIZE); + dim3 blocks((cols + 31) / 32, + (rows + 31) / 32, + MAX_BATCH_SIZE); // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory @@ -5993,11 +5942,10 @@ void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows, int32_t rows = ceilf(total_rows / (float)remaining); int32_t cols = ceilf(total_cols / (float)remaining); - dim3 blocks( - (cols + GPU_WARP_SIZE - 1) / GPU_WARP_SIZE, - (rows + GPU_MAX_WARPS_PER_BLOCK - 1) / GPU_MAX_WARPS_PER_BLOCK, - remaining); - + dim3 blocks((cols + 31) / 32, + (rows + 31) / 32, + remaining); + // no memcpy needed here. Memory will be passed down directly // through paramter passing and live in constant memory diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index d0d8e4e771f..3fbeff3a470 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -818,7 +818,7 @@ void BackpropLstmNonlinearity(const CuMatrixBase &input, // Use 2D block (8x32 threads) as we need to compute column sum. // Use 1D grid to cover the data matrix width `cell_dim`. - const int kWarpSize = GPU_WARP_SIZE; + const int kWarpSize = 32; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); // dim3 dimGrid(n_blocks(cell_dim, dimBlock.x), // n_blocks(num_rows, dimBlock.y)); diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index dfcaf30770a..be8483e48f5 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2705,7 +2705,7 @@ static void UnitTestCuMatrixSetRandUniform() { upper_bound = expected_moment + allowed_deviation; if (!(observed_moment >= lower_bound && observed_moment <= upper_bound)) { KALDI_LOG << "Random matrix is " << M; - KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment + KALDI_ERR << "Bad observed " << pow << "'th moment " << observed_moment << ", expected " << expected_moment << ", allowed range " << lower_bound << " to " << upper_bound; } diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 53831a52bc8..c67842d38bf 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -27,16 +27,9 @@ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include #include #endif -#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" @@ -250,7 +243,7 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, } else { // 2D thread block with warps (blockDim.x) along the row-dim of input M. 
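[Editor's note] The "(8x32) thread block transposing a (32x32) tile" mentioned in this comment is a standard shared-memory pattern; a self-contained sketch follows (hypothetical kernel, not Kaldi's _copy_from_mat_trans, which additionally handles row strides and mixed float/double):

```cuda
// blockDim assumed (32, 8); one 32x32 tile per block. The +1 padding
// column avoids shared-memory bank conflicts on the transposed reads,
// the same trick as s_A[32][32 + 1] in the gemv kernel earlier.
__global__ void transpose_kernel(const float *in, float *out,
                                 int rows, int cols) {
  __shared__ float tile[32][32 + 1];
  const int x = blockIdx.x * 32 + threadIdx.x;
  for (int dy = 0; dy < 32; dy += blockDim.y) {  // 8 tile rows per pass
    const int y = blockIdx.y * 32 + threadIdx.y + dy;
    if (x < cols && y < rows)
      tile[threadIdx.y + dy][threadIdx.x] = in[y * cols + x];
  }
  __syncthreads();
  // Write the tile back transposed; out is a cols x rows matrix.
  const int tx = blockIdx.y * 32 + threadIdx.x;
  for (int dy = 0; dy < 32; dy += blockDim.y) {
    const int ty = blockIdx.x * 32 + threadIdx.y + dy;
    if (tx < rows && ty < cols)
      out[ty * rows + tx] = tile[threadIdx.x][threadIdx.y + dy];
  }
}
```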
// Each (8x32) thread block will transpose (32x32) data - const int32 warpSize = GPU_WARP_SIZE; + const int32 warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(M.NumCols(), warpSize), n_blocks(M.NumRows(), warpSize)); @@ -856,7 +849,7 @@ void CuMatrixBase::DiffGroupPnorm(const CuMatrixBase &in_value, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - const int kWarpSize = GPU_WARP_SIZE; + const int kWarpSize = 32; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(NumCols(), dimBlock.x), n_blocks(NumRows(), dimBlock.y)); @@ -1006,7 +999,7 @@ void CuMatrixBase::AddSmat(Real alpha, const CuSparseMatrix &A, // We use warpSize threads per row to access only the nonzero elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of A. - const int warpSize = GPU_WARP_SIZE; + const int warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(A.NumRows(), dimBlock.y)); @@ -2183,7 +2176,7 @@ Real TraceMatMat(const CuMatrixBase &A, // if the matrix is not in a very bad shape. // (wider or taller than 32x8192) // CPU will then reduce to 1 element. - const int kWarpSize = GPU_WARP_SIZE; + const int kWarpSize = 32; dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize), n_blocks(A.NumRows(), kWarpSize)); @@ -2405,7 +2398,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { // and use transposed copy to fill *this // see CuMatrixBase::CopyFromMat() for more detail of the impl MatrixDim rv_dim = { num_cols_, num_rows_, num_rows_ }; - const int32 warpSize = GPU_WARP_SIZE; + const int32 warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(rv_dim.cols, warpSize), n_blocks(rv_dim.rows, warpSize)); @@ -2415,7 +2408,7 @@ void CuMatrixBase::CopyColsFromVec(const CuVectorBase &rv) { } else if (rv.Dim() == num_rows_) { // use 2D block (8x32) and large enough grid to cover matrix *this // dimBlock.x need to be at least warpSize for coalesced memory access. - const int32 warpSize = GPU_WARP_SIZE; + const int32 warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(num_cols_, dimBlock.x), n_blocks(num_rows_, dimBlock.y)); diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 775fecd82c6..3ffe67d8b06 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -250,7 +250,7 @@ class CuMatrixBase { template void CopyFromTp(const CuTpMatrix &M, MatrixTransposeType trans = kNoTrans); - + // This function will copy from source rows (start_range, end_range] // if the range is outside of the clamped region then the clamped // row will be replicated across the out of range areas @@ -307,9 +307,9 @@ class CuMatrixBase { void PowAbs(const CuMatrixBase &src, Real power, bool include_sign=false); void Floor(const CuMatrixBase &src, Real floor_val); - + void Ceiling(const CuMatrixBase &src, Real ceiling_val); - + /// This is equivalent to running: /// Floor(src, lower_limit); /// Ceiling(src, upper_limit); @@ -320,7 +320,7 @@ class CuMatrixBase { /// (x < 0 ? exp(x) : x + 1). This function is used /// in our RNNLM training. void ExpSpecial(const CuMatrixBase &src); - + /// Softmax nonlinearity /// Y = Softmax(X) : Yij = e^Xij / sum_k(e^Xik), done to each row, /// with attention to avoiding overflow or underflow. @@ -333,7 +333,7 @@ class CuMatrixBase { /// Supports in-place operation (i.e. this == &src). 
void LogSoftMaxPerRow(const CuMatrixBase &src); - + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -439,23 +439,23 @@ class CuMatrixBase { this -> Pow(*this, power); }; - + inline void ApplyPowAbs(Real power, bool include_sign=false) { this -> PowAbs(*this, power, include_sign); }; - + inline void ApplyHeaviside() { this -> Heaviside(*this); }; - + inline void ApplyFloor(Real floor_val) { this -> Floor(*this, floor_val); }; - + inline void ApplyCeiling(Real ceiling_val) { this -> Ceiling(*this, ceiling_val); }; - + inline void ApplyExp() { this -> Exp(*this); }; @@ -924,7 +924,7 @@ class CuSubMatrix: public CuMatrixBase { /// This type of constructor is needed for Range() to work [in CuMatrix base /// class]. Cannot make it explicit or that breaks. - inline CuSubMatrix(const CuSubMatrix &other): + inline CuSubMatrix (const CuSubMatrix &other): CuMatrixBase (other.data_, other.num_rows_, other.num_cols_, other.stride_) {} private: diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 001170fdeca..756d580c7cf 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -21,16 +21,9 @@ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include #include #endif -#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index 96085848d72..d1efc0cff9c 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -19,16 +19,9 @@ // limitations under the License. #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include #include #endif -#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc index 0c2230a8731..aad34b5dd54 100644 --- a/src/cudamatrix/cu-sparse-matrix-test.cc +++ b/src/cudamatrix/cu-sparse-matrix-test.cc @@ -125,8 +125,8 @@ static void UnitTestCuSparseMatrixSelectRowsAndTranspose() { template static void UnitTestCuSparseMatrixTraceMatSmat() { for (int32 i = 0; i < 2; i++) { - MatrixIndexT row = 2 + Rand() % 3; - MatrixIndexT col = 1 + Rand() % 4; + MatrixIndexT row = 10 + Rand() % 40; + MatrixIndexT col = 10 + Rand() % 50; CuMatrix mat1(row, col); CuMatrix mat2(col, row); @@ -147,13 +147,11 @@ static void UnitTestCuSparseMatrixTraceMatSmat() { cu_smat2.CopyToMat(&mat2); Real trace1 = TraceMatMat(mat3, mat1, kTrans); - Real trace2 = TraceMatSmat(mat3, cu_smat1, kTrans); AssertEqual(trace1, trace2, 0.00001); trace1 = TraceMatMat(mat3, mat2, kNoTrans); trace2 = TraceMatSmat(mat3, cu_smat2, kNoTrans); - AssertEqual(trace1, trace2, 0.00001); } } diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index 81ecbe68080..703aa40e735 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -22,16 +22,9 @@ #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include #include #endif -#endif #include #include @@ -145,7 +138,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all selected rows. 
- const int warpSize = GPU_WARP_SIZE; + const int warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(row_indexes.Dim(), dimBlock.y)); @@ -168,7 +161,7 @@ void CuSparseMatrix::SelectRows(const CuArray &row_indexes, template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -201,8 +194,8 @@ template CuSparseMatrix::CuSparseMatrix(const CuArray &indexes, const CuVectorBase &weights, int32 dim, MatrixTransposeType trans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), - csr_val_(NULL) { + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( + NULL) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined); @@ -273,9 +266,8 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( 1 * sizeof(int))); - csr_col_idx_ = NULL; // may be freed, but this is allowed. csr_val_ = NULL; } else { KALDI_ASSERT(num_rows > 0); @@ -285,16 +277,10 @@ void CuSparseMatrix::Resize(const MatrixIndexT num_rows, num_rows_ = num_rows; num_cols_ = num_cols; nnz_ = nnz; - csr_row_ptr_ = static_cast(CuDevice::Instantiate().Malloc((num_rows + 1) * sizeof(int))); - if (nnz > 0) { - csr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( - nnz * sizeof(int))); - csr_val_ = static_cast(CuDevice::Instantiate().Malloc( + csr_row_ptr_col_idx_ = static_cast(CuDevice::Instantiate().Malloc( + (num_rows + 1 + nnz) * sizeof(int))); + csr_val_ = static_cast(CuDevice::Instantiate().Malloc( nnz * sizeof(Real))); - } else { - csr_col_idx_ = NULL; - csr_val_ = NULL; - } CuSubArray row_ptr(CsrRowPtr(), NumRows() + 1); row_ptr.Set(nnz); if (resize_type == kSetZero) { @@ -316,11 +302,8 @@ void CuSparseMatrix::Destroy() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - if (csr_row_ptr_) { - CuDevice::Instantiate().Free(csr_row_ptr_); - } - if (csr_col_idx_) { - CuDevice::Instantiate().Free(csr_col_idx_); + if (csr_row_ptr_col_idx_) { + CuDevice::Instantiate().Free(csr_row_ptr_col_idx_); } if (csr_val_) { CuDevice::Instantiate().Free(csr_val_); @@ -328,8 +311,7 @@ void CuSparseMatrix::Destroy() { num_rows_ = 0; num_cols_ = 0; nnz_ = 0; - csr_row_ptr_ = NULL; - csr_col_idx_ = NULL; + csr_row_ptr_col_idx_ = NULL; csr_val_ = NULL; CuDevice::Instantiate().AccuProfile(__func__, tim); } else @@ -396,17 +378,11 @@ void CuSparseMatrix::CopyFromSmat(const CuSparseMatrix& smat, CuSubVector val_from(smat.CsrVal(), smat.NumElements()); val_to.CopyFromVec(val_from); - { - CuSubArray idx_to(csr_row_ptr_, NumRows() + 1); - CuSubArray idx_from(smat.csr_row_ptr_, NumRows() + 1); - idx_to.CopyFromArray(idx_from); - } - - { - CuSubArray idx_to(csr_col_idx_, NumElements()); - CuSubArray idx_from(smat.csr_col_idx_, NumElements()); - idx_to.CopyFromArray(idx_from); - } + CuSubArray idx_to(csr_row_ptr_col_idx_, + NumRows() + 1 + NumElements()); + CuSubArray idx_from(smat.csr_row_ptr_col_idx_, + smat.NumRows() + 1 + smat.NumElements()); + idx_to.CopyFromArray(idx_from); } else { Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), 
kUndefined); @@ -437,14 +413,9 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { smat->Resize(0, 0); return; } - CuSubArray row_ptr(csr_row_ptr_, NumRows() + 1); - std::vector row_ptr_cpu; - row_ptr.CopyToVec(&row_ptr_cpu); - - - CuSubArray col_idx(csr_col_idx_, NumElements()); - std::vector col_idx_cpu; - col_idx.CopyToVec(&col_idx_cpu); + CuSubArray idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements()); + std::vector idx_cpu; + idx.CopyToVec(&idx_cpu); CuSubVector val(CsrVal(), NumElements()); Vector val_cpu(NumElements(), kUndefined); @@ -454,8 +425,8 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const { NumRows()); int n = 0; for (int i = 0; i < NumRows(); ++i) { - for (; n < row_ptr_cpu[i + 1]; ++n) { - const MatrixIndexT j = col_idx_cpu[n]; + for (; n < idx_cpu[i + 1]; ++n) { + const MatrixIndexT j = idx_cpu[NumRows() + 1 + n]; pairs[i].push_back( { j, val_cpu(n) }); } } @@ -513,8 +484,7 @@ void CuSparseMatrix::Swap(CuSparseMatrix *smat) { std::swap(num_rows_, smat->num_rows_); std::swap(num_cols_, smat->num_cols_); std::swap(nnz_, smat->nnz_); - std::swap(csr_row_ptr_, smat->csr_row_ptr_); - std::swap(csr_col_idx_, smat->csr_col_idx_); + std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_); std::swap(csr_val_, smat->csr_val_); } else #endif @@ -578,7 +548,7 @@ Real TraceMatSmat(const CuMatrixBase &A, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows of B. - const int warpSize = GPU_WARP_SIZE; + const int warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(B.NumRows(), dimBlock.y)); @@ -668,7 +638,7 @@ void CuSparseMatrix::CopyToMat(CuMatrixBase *M, // We use warpSize threads per row to access only the nnz elements. // Every CU1DBLOCK/warpSize rows share one thread block. // 1D grid to cover all rows. - const int warpSize = GPU_WARP_SIZE; + const int warpSize = 32; dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); dim3 dimGrid(n_blocks(NumRows(), dimBlock.y)); diff --git a/src/cudamatrix/cu-sparse-matrix.h b/src/cudamatrix/cu-sparse-matrix.h index 180beed6183..82b17a0dc71 100644 --- a/src/cudamatrix/cu-sparse-matrix.h +++ b/src/cudamatrix/cu-sparse-matrix.h @@ -121,13 +121,13 @@ class CuSparseMatrix { /// Default constructor CuSparseMatrix() : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( NULL) { } /// Constructor from CPU-based sparse matrix. explicit CuSparseMatrix(const SparseMatrix &smat) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat); } @@ -135,7 +135,7 @@ class CuSparseMatrix { /// Constructor from GPU-based sparse matrix (supports transposition). CuSparseMatrix(const CuSparseMatrix &smat, MatrixTransposeType trans = kNoTrans) : - num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_( + num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_( NULL) { this->CopyFromSmat(smat, trans); } @@ -200,19 +200,19 @@ class CuSparseMatrix { /// indices of the first nonzero element in the i-th row, while the last entry /// contains nnz_, as zero-based CSR format is used. 
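[Editor's aside: a minimal CPU-side illustration of the packed index layout the doc comment above describes: one int array of length num_rows + 1 + nnz holds the row pointers followed by the column indices. The matrix contents are made up; the indexing mirrors the CopyToSmat loop in this patch.]

    #include <cstdio>

    int main() {
      // A 3x4 matrix with 5 nonzeros, in zero-based CSR:
      //   row 0: (0,1.0) (2,2.0)   row 1: (1,3.0)   row 2: (0,4.0) (3,5.0)
      const int num_rows = 3, nnz = 5;
      int idx[num_rows + 1 + nnz] = {
          0, 2, 3, 5,    // row pointers: idx[i]..idx[i+1] bound row i's nonzeros
          0, 2, 1, 0, 3  // column indices, stored right after the row pointers
      };
      float val[nnz] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};

      for (int i = 0; i < num_rows; ++i)
        for (int n = idx[i]; n < idx[i + 1]; ++n)  // n-th nonzero overall
          std::printf("(%d,%d) = %g\n", i, idx[num_rows + 1 + n], val[n]);
      return 0;
    }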
const int* CsrRowPtr() const { - return csr_row_ptr_; + return csr_row_ptr_col_idx_; } int* CsrRowPtr() { - return csr_row_ptr_; + return csr_row_ptr_col_idx_; } /// Returns pointer to the integer array of length nnz_ that contains /// the column indices of the corresponding elements in array CsrVal() const int* CsrColIdx() const { - return csr_col_idx_; + return csr_row_ptr_col_idx_ + num_rows_ + 1; } int* CsrColIdx() { - return csr_col_idx_; + return csr_row_ptr_col_idx_ + num_rows_ + 1; } private: @@ -238,10 +238,9 @@ class CuSparseMatrix { // number of non-zeros MatrixIndexT nnz_; - // length num_rows_ + 1 - int* csr_row_ptr_; - // length nnz_ - int* csr_col_idx_; + // csr row ptrs and col indices in a single int array + // of the length (num_rows_ + 1 + nnz_) + int* csr_row_ptr_col_idx_; // csr value array of the length nnz_ Real* csr_val_; diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index da19a31b39a..377c34239f0 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -19,16 +19,9 @@ // limitations under the License. #if HAVE_CUDA==1 -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include #include #endif -#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" diff --git a/src/cudamatrix/cu-tp-matrix.h b/src/cudamatrix/cu-tp-matrix.h index 4219467f615..8de46ec46f5 100644 --- a/src/cudamatrix/cu-tp-matrix.h +++ b/src/cudamatrix/cu-tp-matrix.h @@ -48,18 +48,18 @@ class CuTpMatrix : public CuPackedMatrix { CuTpMatrix() : CuPackedMatrix() {} explicit CuTpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero) : CuPackedMatrix(r, resize_type) {} - - explicit CuTpMatrix(const TpMatrix &orig) + + explicit CuTpMatrix(const TpMatrix &orig) : CuPackedMatrix(orig) {} // This constructor lacks the "explicit" keyword so that // we can include this class in std::vector. - CuTpMatrix(const CuTpMatrix &orig) + CuTpMatrix(const CuTpMatrix &orig) : CuPackedMatrix(orig) {} - - explicit CuTpMatrix(const CuMatrixBase &orig, + + explicit CuTpMatrix(const CuMatrixBase &orig, MatrixTransposeType trans = kNoTrans); - + ~CuTpMatrix() {} void CopyFromMat(const CuMatrixBase &M, @@ -70,12 +70,12 @@ class CuTpMatrix : public CuPackedMatrix { } void CopyFromTp(const TpMatrix &other) { CuPackedMatrix::CopyFromPacked(other); - } + } void Cholesky(const CuSpMatrix& Orig); void Invert(); CuTpMatrix &operator = (const CuTpMatrix &in); - + protected: inline const TpMatrix &Mat() const { return *(reinterpret_cast* >(this)); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 6667f2bca62..8736782a3e0 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -22,16 +22,9 @@ // limitations under the License. #if HAVE_CUDA == 1 -#ifdef __IS_HIP_COMPILE__ -#include -#include - -#include "hipify.h" -#else #include #include #endif -#endif #include "base/timer.h" #include "cudamatrix/cu-common.h" @@ -636,10 +629,7 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, N.Data(), N.Stride(), beta, data_); } else { // Case 2: diag(M'*N) == sum(M.*N, 1) - // (2*CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE/2 - // or - // (CU1DBLOCK/GPU_WARP_SIZE)xGPU_WARP_SIZE - // 2D block for coalesced memory access. + // 16x16 or 8x32 2D block for coalesced memory access. // Grid shape is designed as follows, // 1. 
for small matrices, use 1D grid with only 1 row of 16x16 block, // to avoid multiple kernel launch; @@ -647,12 +637,11 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // use 1- or 2-D grid so that the grid contains // at least and not much larger than 'kOptNumBlocks' blocks // to fully utilize the GPU; - const int32 warpSize = GPU_WARP_SIZE; + const int32 warpSize = 32; const int32 kOptNumBlocks = 512; const int32 tile_dim = - (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) - ? GPU_WARP_SIZE / 2 - : GPU_WARP_SIZE; + (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ? + 16 : 32; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x), n_blocks(N.NumRows(), dimBlock.y)); @@ -678,9 +667,8 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, // 16x16 or 8x32 2D block for matrix transpose and coalesced memory access. // One block per 'tile_dim' columns of N. // 1D grid expands along the row of N. - int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 - ? GPU_WARP_SIZE - : GPU_WARP_SIZE / 2; + int tile_dim = + sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(N.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, M.Data(), M.Stride(), @@ -688,9 +676,8 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, } else { // Case 4: diag(M'*N') == sum(N'.*M, 1) // Same kernel and config as case 3 except M and N are swapped. - int tile_dim = sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 - ? GPU_WARP_SIZE - : GPU_WARP_SIZE / 2; + int tile_dim = + sizeof(Real) == sizeof(float) && N.NumCols() >= 2048 ? 32 : 16; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); dim3 dimGrid(n_blocks(M.NumCols(), tile_dim)); cuda_add_diag_mat_mat_MN(dimGrid, dimBlock, alpha, N.Data(), N.Stride(), diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 82e1fb47fcb..f1c32756887 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -243,7 +243,7 @@ class CuVectorBase { /// Default constructor: make it protected so the user cannot /// instantiate this class. - CuVectorBase(): data_(NULL), dim_(0) { } + CuVectorBase(): data_(NULL), dim_(0) { } Real *data_; ///< GPU data pointer (or regular data pointer ///< if CUDA is not compiled in or we have no GPU). 
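[Editor's aside: the AddDiagMatMat cases above all rest on the identity diag(M' * N) == sum(M .* N, 1). Below is a plain CPU reference for that identity, with made-up dimensions and data: entry j of the result is the dot product of column j of M with column j of N, and the full M' * N product is never formed.]

    #include <cstdio>
    #include <vector>

    int main() {
      const int rows = 3, cols = 2;
      // Row-major rows x cols matrices M and N.
      std::vector<float> M = {1, 2, 3, 4, 5, 6};
      std::vector<float> N = {7, 8, 9, 10, 11, 12};
      std::vector<float> diag(cols, 0.0f);
      for (int r = 0; r < rows; ++r)    // accumulate elementwise products
        for (int c = 0; c < cols; ++c)  // column by column
          diag[c] += M[r * cols + c] * N[r * cols + c];
      std::printf("diag(M'N) = [%g, %g]\n", diag[0], diag[1]);  // [89, 128]
      return 0;
    }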
diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index 537cca9b97f..63dbe630568 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -28,18 +28,14 @@ namespace kaldi { #if HAVE_CUDA == 1 -#ifndef CUBLAS_R_32F -#define CUBLAS_R_32F CUDA_R_32F -#endif inline cublasStatus_t cublas_gemm( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n,int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { #if CUDA_VERSION >= 11000 - return cublasGemmEx(handle, transa, transb, m, n, k, &alpha, A, CUBLAS_R_32F, - lda, B, CUBLAS_R_32F, ldb, &beta, C, CUBLAS_R_32F, ldc, - CuDevice::Instantiate().GetCublasComputeType(), + return cublasGemmEx(handle,transa,transb,m,n,k,&alpha,A,CUDA_R_32F,lda,B,CUDA_R_32F,ldb,&beta, + C,CUDA_R_32F,ldc,CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); @@ -67,11 +63,9 @@ inline cublasStatus_t cublas_gemmBatched( const float *A[], int lda, const float *B[], int ldb, float beta, float *C[], int ldc, int batchCount) { #if CUDA_VERSION >= 11000 - return cublasGemmBatchedEx( - handle, transa, transb, m, n, k, &alpha, (const void **)A, CUBLAS_R_32F, - lda, (const void **)B, CUBLAS_R_32F, ldb, &beta, (void **)C, CUBLAS_R_32F, - ldc, batchCount, CuDevice::Instantiate().GetCublasComputeType(), - CuDevice::Instantiate().GetCublasGemmAlgo()); + return cublasGemmBatchedEx(handle, transa, transb, m, n, k, &alpha, (const void**)A, CUDA_R_32F, lda, + (const void**)B, CUDA_R_32F, ldb, &beta, (void**)C, CUDA_R_32F, ldc, batchCount, + CuDevice::Instantiate().GetCublasComputeType(), CuDevice::Instantiate().GetCublasGemmAlgo()); #else return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); #endif @@ -225,7 +219,6 @@ inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, // cuSPARSE wrappers // #if CUDA_VERSION >= 10020 -#ifndef __IS_HIP_COMPILE__ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal, const int *csrRowPtr, @@ -250,7 +243,6 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, return status; } -#endif inline cusparseStatus_t cusparse_csrmm2(cusparseHandle_t handle, cusparseOperation_t transA, @@ -327,7 +319,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) +#if CUDA_VERSION >= 10020 return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_32F, copyValues, idxBase); @@ -344,7 +336,7 @@ inline cusparseStatus_t cusparse_csr2csc(cusparseHandle_t handle, int m, int n, int *cscRowInd, int *cscColPtr, cusparseAction_t copyValues, cusparseIndexBase_t idxBase) { -#if CUDA_VERSION >= 10020 && !defined(__IS_HIP_COMPILE__) +#if CUDA_VERSION >= 10020 return cusparse_csr2csc(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscRowInd, cscColPtr, CUDA_R_64F, copyValues, idxBase); diff --git a/src/decoder/lattice-biglm-faster-decoder.h b/src/decoder/lattice-biglm-faster-decoder.h index 9f50739a03d..87841799fe7 100644 --- a/src/decoder/lattice-biglm-faster-decoder.h +++ b/src/decoder/lattice-biglm-faster-decoder.h @@ -123,7 
+123,7 @@ class LatticeBiglmFasterDecoder { if (!GetRawLattice(&fst, use_final_probs)) return false; // std::cout << "Raw lattice is:\n"; // fst::FstPrinter fstprinter(fst, NULL, NULL, NULL, false, true); - // fstprinter.Print(&std::cout, "standard output"); + // fstprinter.Print(std::cout, "standard output"); ShortestPath(fst, ofst); return true; } diff --git a/src/decoder/lattice-simple-decoder.cc b/src/decoder/lattice-simple-decoder.cc index 87378f93bbd..cc8712e854d 100644 --- a/src/decoder/lattice-simple-decoder.cc +++ b/src/decoder/lattice-simple-decoder.cc @@ -45,8 +45,8 @@ void LatticeSimpleDecoder::InitDecoding() { bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { InitDecoding(); - - while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { + + while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { if (NumFramesDecoded() % config_.prune_interval == 0) PruneActiveTokens(config_.lattice_beam * config_.prune_scale); ProcessEmitting(decodable); @@ -57,7 +57,7 @@ bool LatticeSimpleDecoder::Decode(DecodableInterface *decodable) { ProcessNonemitting(); } FinalizeDecoding(); - + // Returns true if we have any kind of traceback available (not necessarily // to the end state; query ReachedFinal() for that). return !final_costs_.empty(); @@ -88,9 +88,9 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, if (decoding_finalized_ && !use_final_probs) KALDI_ERR << "You cannot call FinalizeDecoding() and then call " << "GetRawLattice() with use_final_probs == false"; - + unordered_map final_costs_local; - + const unordered_map &final_costs = (decoding_finalized_ ? final_costs_ : final_costs_local); @@ -100,7 +100,7 @@ bool LatticeSimpleDecoder::GetRawLattice(Lattice *ofst, ofst->DeleteStates(); int32 num_frames = NumFramesDecoded(); KALDI_ASSERT(num_frames > 0); - const int32 bucket_count = num_toks_/2 + 3; + const int32 bucket_count = num_toks_/2 + 3; unordered_map tok_map(bucket_count); // First create all states. for (int32 f = 0; f <= num_frames; f++) { @@ -169,10 +169,10 @@ bool LatticeSimpleDecoder::GetLattice( fst::ILabelCompare ilabel_comp; ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes // lattice-determinization more efficient. - + fst::DeterminizeLatticePrunedOptions lat_opts; lat_opts.max_mem = config_.det_opts.max_mem; - + DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. Connect(ofst); // Remove unreachable states... there might be @@ -196,7 +196,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( bool emitting, bool *changed) { KALDI_ASSERT(frame < active_toks_.size()); Token *&toks = active_toks_[frame].toks; - + unordered_map::iterator find_iter = cur_toks_.find(state); if (find_iter == cur_toks_.end()) { // no such token presently. // Create one. @@ -221,7 +221,7 @@ inline LatticeSimpleDecoder::Token *LatticeSimpleDecoder::FindOrAddToken( return tok; } } - + // delta is the amount by which the extra_costs must // change before it sets "extra_costs_changed" to true. If delta is larger, // we'll tend to go back less far toward the beginning of the file. 
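[Editor's aside: a schematic restatement of the decoding driver visible in the lattice-simple-decoder hunk above, assuming simplified stubs rather than the real class: one emitting pass per frame, a nonemitting pass after it, and periodic lattice pruning. The numeric defaults are illustrative only.]

    struct ToyDecoder {
      int prune_interval = 25;     // config_.prune_interval (illustrative value)
      float lattice_beam = 10.0f;  // config_.lattice_beam (illustrative value)
      float prune_scale = 0.1f;    // config_.prune_scale (illustrative value)

      void Decode(int num_frames) {
        InitDecoding();
        for (int frame = 0; frame < num_frames; ++frame) {
          if (frame % prune_interval == 0)
            PruneActiveTokens(lattice_beam * prune_scale);  // cheap periodic prune
          ProcessEmitting(frame);  // expand arcs that consume this frame
          ProcessNonemitting();    // then follow epsilon arcs within the frame
        }
        FinalizeDecoding();        // final prune, using final-probs
      }

      // Stubs standing in for the real token-passing machinery:
      void InitDecoding() {}
      void PruneActiveTokens(float delta) { (void)delta; }
      void ProcessEmitting(int frame) { (void)frame; }
      void ProcessNonemitting() {}
      void FinalizeDecoding() {}
    };

    int main() { ToyDecoder d; d.Decode(100); return 0; }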
@@ -242,7 +242,7 @@ void LatticeSimpleDecoder::PruneForwardLinks( warned_ = true; } } - + bool changed = true; while (changed) { changed = false; @@ -300,7 +300,7 @@ void LatticeSimpleDecoder::ComputeFinalCosts( BaseFloat infinity = std::numeric_limits::infinity(); BaseFloat best_cost = infinity, best_cost_with_final = infinity; - + for (unordered_map::const_iterator iter = cur_toks_.begin(); iter != cur_toks_.end(); ++iter) { StateId state = iter->first; @@ -336,19 +336,19 @@ void LatticeSimpleDecoder::ComputeFinalCosts( // on the final frame. If there are final tokens active, it uses the final-probs // for pruning, otherwise it treats all tokens as final. void LatticeSimpleDecoder::PruneForwardLinksFinal() { - KALDI_ASSERT(!active_toks_.empty()); + KALDI_ASSERT(!active_toks_.empty()); int32 frame_plus_one = active_toks_.size() - 1; if (active_toks_[frame_plus_one].toks == NULL) // empty list; should not happen. KALDI_WARN << "No tokens alive at end of file\n"; - typedef unordered_map::const_iterator IterType; + typedef unordered_map::const_iterator IterType; ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); decoding_finalized_ = true; // We're about to delete some of the tokens active on the final frame, so we // clear cur_toks_ because otherwise it would then contain dangling pointers. cur_toks_.clear(); - + // Now go through tokens on this frame, pruning forward links... may have to // iterate a few times until there is no more change, because the list is not // in topological order. This is a modified version of the code in @@ -429,7 +429,7 @@ BaseFloat LatticeSimpleDecoder::FinalRelativeCost() const { return final_relative_cost_; } } - + // Prune away any tokens on this frame that have no forward links. [we don't do // this in PruneForwardLinks because it would give us a problem with dangling // pointers]. @@ -453,14 +453,14 @@ void LatticeSimpleDecoder::PruneTokensForFrame(int32 frame) { } } } - + // Go backwards through still-alive tokens, pruning them, starting not from // the current frame (where we want to keep all tokens) but from the frame before // that. We go backwards through the frames and stop when we reach a point // where the delta-costs are not changing (and the delta controls when we consider // a cost to have "not changed"). void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { - int32 cur_frame_plus_one = NumFramesDecoded(); + int32 cur_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; // The index "f" below represents a "frame plus one", i.e. you'd have to subtract // one to get the corresponding index for the decodable object. 
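[Editor's aside: a minimal sketch of the ComputeFinalCosts logic above, assuming a toy Token type and a made-up final-weight table. Non-final states contribute only to best_cost; the gap between the two costs is what FinalRelativeCost() reports.]

    #include <algorithm>
    #include <cstdio>
    #include <limits>
    #include <unordered_map>

    struct Token { float tot_cost; };

    int main() {
      const float kInf = std::numeric_limits<float>::infinity();
      // state-id -> active token, plus each state's final weight (kInf = non-final).
      std::unordered_map<int, Token> cur_toks = {{0, {3.0f}}, {1, {2.5f}}, {2, {4.0f}}};
      std::unordered_map<int, float> final_weight = {{0, kInf}, {1, 1.0f}, {2, 0.5f}};

      float best_cost = kInf, best_cost_with_final = kInf;
      for (const auto &p : cur_toks) {
        best_cost = std::min(best_cost, p.second.tot_cost);
        best_cost_with_final = std::min(best_cost_with_final,
                                        p.second.tot_cost + final_weight[p.first]);
      }
      // The relative cost says how much worse the best *final* path is than the
      // best partial path; the decoder uses it to decide whether to terminate.
      std::printf("best=%g with_final=%g relative=%g\n", best_cost,
                  best_cost_with_final, best_cost_with_final - best_cost);
      return 0;
    }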
@@ -468,7 +468,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // Reason why we need to prune forward links in this situation: // (1) we have never pruned them // (2) we never pruned the forward links on the next frame, which - // + // if (active_toks_[f].must_prune_forward_links) { bool extra_costs_changed = false, links_pruned = false; PruneForwardLinks(f, &extra_costs_changed, &links_pruned, delta); @@ -478,7 +478,7 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { active_toks_[f].must_prune_tokens = true; active_toks_[f].must_prune_forward_links = false; } - if (f+1 < cur_frame_plus_one && + if (f+1 < cur_frame_plus_one && active_toks_[f+1].must_prune_tokens) { PruneTokensForFrame(f+1); active_toks_[f+1].must_prune_tokens = false; @@ -493,20 +493,20 @@ void LatticeSimpleDecoder::PruneActiveTokens(BaseFloat delta) { // (optionally) on the final frame. Takes into account the final-prob of // tokens. This function used to be called PruneActiveTokensFinal(). void LatticeSimpleDecoder::FinalizeDecoding() { - int32 final_frame_plus_one = NumFramesDecoded(); + int32 final_frame_plus_one = NumFramesDecoded(); int32 num_toks_begin = num_toks_; PruneForwardLinksFinal(); - for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { + for (int32 f = final_frame_plus_one - 1; f >= 0; f--) { bool b1, b2; // values not used. BaseFloat dontcare = 0.0; PruneForwardLinks(f, &b1, &b2, dontcare); PruneTokensForFrame(f + 1); } - PruneTokensForFrame(0); + PruneTokensForFrame(0); KALDI_VLOG(3) << "pruned tokens from " << num_toks_begin << " to " << num_toks_; } - + void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { int32 frame = active_toks_.size() - 1; // frame is the frame-index // (zero-based) used to get likelihoods @@ -538,9 +538,9 @@ void LatticeSimpleDecoder::ProcessEmitting(DecodableInterface *decodable) { // AddToken adds the next_tok to cur_toks_ (if not already present). Token *next_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, true, NULL); - + // Add ForwardLink from tok to next_tok (put on head of list tok->links) - tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, + tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel, graph_cost, ac_cost, tok->links); } } @@ -553,7 +553,7 @@ void LatticeSimpleDecoder::ProcessNonemitting() { // Note: "frame" is the time-index we just processed, or -1 if // we are processing the nonemitting transitions before the // first frame (called from InitDecoding()). - + // Processes nonemitting arcs for one frame. Propagates within // cur_toks_. Note-- this queue structure is is not very optimal as // it may cause us to process states unnecessarily (e.g. more than once), @@ -569,9 +569,15 @@ void LatticeSimpleDecoder::ProcessNonemitting() { queue.push_back(state); best_cost = std::min(best_cost, iter->second->tot_cost); } - + if (queue.empty()) { + if (!warned_) { + KALDI_ERR << "Error in ProcessEmitting: no surviving tokens: frame is " + << frame; + warned_ = true; + } + } BaseFloat cutoff = best_cost + config_.beam; - + while (!queue.empty()) { StateId state = queue.back(); queue.pop_back(); @@ -594,10 +600,10 @@ void LatticeSimpleDecoder::ProcessNonemitting() { bool changed; Token *new_tok = FindOrAddToken(arc.nextstate, frame + 1, tot_cost, false, &changed); - + tok->links = new ForwardLink(new_tok, 0, arc.olabel, graph_cost, 0, tok->links); - + // "changed" tells us whether the new token has a different // cost from before, or is new [if so, add into queue]. 
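[Editor's aside: a toy version of the link records ProcessEmitting creates above, assuming simplified types. The point is that graph and acoustic costs are stored separately on each ForwardLink, so lattice arcs can be rescored later, and that new links are head-inserted into the token's singly linked list.]

    #include <cstdio>

    struct ForwardLink {
      int ilabel, olabel;         // ilabel 0 marks a nonemitting (epsilon) link
      float graph_cost, ac_cost;  // kept apart for later acoustic rescoring
      ForwardLink *next;          // head-insertion list, as in tok->links above
    };

    struct Token { ForwardLink *links = nullptr; };

    static void AddLink(Token *tok, int il, int ol, float g, float a) {
      tok->links = new ForwardLink{il, ol, g, a, tok->links};
    }

    int main() {
      Token tok;
      AddLink(&tok, 15, 42, 0.7f, 3.2f);  // emitting arc: both costs present
      AddLink(&tok, 0, 7, 0.4f, 0.0f);    // epsilon arc: acoustic cost is zero
      for (ForwardLink *l = tok.links; l; l = l->next)
        std::printf("il=%d ol=%d graph=%g ac=%g\n",
                    l->ilabel, l->olabel, l->graph_cost, l->ac_cost);
      // (Leaked on purpose here; the real decoder frees links during pruning.)
      return 0;
    }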
if (changed && fst_.NumInputEpsilons(arc.nextstate) != 0) @@ -656,3 +662,5 @@ void LatticeSimpleDecoder::PruneCurrentTokens(BaseFloat beam, unordered_mapComputeStatsForFrame(frame, &stats); + if ((*feat)(0) > opts_.min_energy) { + // first get the raw CMVN stats (this involves caching..) + this->ComputeStatsForFrame(frame, &stats); + } // now smooth them. SmoothOnlineCmvnStats(orig_state_.speaker_cmvn_stats, orig_state_.global_cmvn_stats, diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h index b9dfcc0171e..b062d4f84e4 100644 --- a/src/feat/online-feature.h +++ b/src/feat/online-feature.h @@ -215,6 +215,7 @@ struct OnlineCmvnOptions { // modulus. std::string skip_dims; // Colon-separated list of dimensions to skip normalization // of, e.g. 13:14:15. + float min_energy; // Minimum energy (c0 coefficient) to update frame stats OnlineCmvnOptions(): cmn_window(600), @@ -224,7 +225,8 @@ struct OnlineCmvnOptions { normalize_variance(false), modulus(20), ring_buffer_size(20), - skip_dims("") { } + skip_dims(""), + min_energy(50.0f) { } void Check() const { KALDI_ASSERT(speaker_frames <= cmn_window && global_frames <= speaker_frames @@ -248,7 +250,9 @@ struct OnlineCmvnOptions { po->Register("norm-means", &normalize_mean, "If true, do mean normalization " "(note: you cannot normalize the variance but not the mean)"); po->Register("skip-dims", &skip_dims, "Dimensions to skip normalization of " - "(colon-separated list of integers)");} + "(colon-separated list of integers)"); + po->Register("cmn-min-energy", &min_energy, "Minimum energy value (c0 coefficient) " + "to update frame stats.");} }; diff --git a/src/fstbin/fsts-project.cc b/src/fstbin/fsts-project.cc index 015f1431725..d8c8b9d97cd 100644 --- a/src/fstbin/fsts-project.cc +++ b/src/fstbin/fsts-project.cc @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) { std::string key = fst_reader.Key(); VectorFst fst(fst_reader.Value()); - Project(&fst, project_output ? PROJECT_OUTPUT : PROJECT_INPUT); + Project(&fst, project_output ? fst::ProjectType::OUTPUT : fst::ProjectType::INPUT); fst_writer.Write(key, fst); n_done++; diff --git a/src/fstext/context-fst-test.cc b/src/fstext/context-fst-test.cc index 2589c5c344e..16009714c57 100644 --- a/src/fstext/context-fst-test.cc +++ b/src/fstext/context-fst-test.cc @@ -23,8 +23,6 @@ #include "util/kaldi-io.h" #include "base/kaldi-math.h" -#include "fstext/openfst_compat.h" - namespace fst { using std::vector; @@ -198,7 +196,7 @@ static void TestContextFst(bool verbose, bool use_matcher) { std::cout << "Sequence FST is:\n"; { // Try to print the fst. FstPrinter fstprinter(*f, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } } @@ -226,7 +224,7 @@ static void TestContextFst(bool verbose, bool use_matcher) { std::cout << "Composed FST is:\n"; { // Try to print the fst. 
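[Editor's aside: the online-feature changes above gate CMVN stat updates on frame energy. A small sketch of that gate follows, under made-up c0 values; the 50.0f threshold is the patch's --cmn-min-energy default, everything else is illustrative.]

    #include <cstdio>
    #include <vector>

    struct Stats { double sum_c0 = 0; int count = 0; };

    int main() {
      const float min_energy = 50.0f;  // default of --cmn-min-energy in the patch
      std::vector<float> c0 = {62.1f, 48.0f, 55.3f, 12.7f, 58.9f};  // made-up frames
      Stats stats;
      for (float e : c0)
        if (e > min_energy) {  // same test as (*feat)(0) > opts_.min_energy above,
          stats.sum_c0 += e;   // so silent frames cannot drag the mean estimate
          ++stats.count;
        }
      std::printf("used %d of %zu frames, mean c0 = %g\n",
                  stats.count, c0.size(), stats.sum_c0 / stats.count);
      return 0;
    }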
FstPrinter fstprinter(fst_composed, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } } diff --git a/src/fstext/context-fst.cc b/src/fstext/context-fst.cc index d382144700d..817cf04cf50 100644 --- a/src/fstext/context-fst.cc +++ b/src/fstext/context-fst.cc @@ -279,7 +279,7 @@ void ComposeContext(const vector &disambig_syms_in, if (central_position != context_width-1) { AddSubsequentialLoop(subseq_sym, ifst); if (project_ifst) { - fst::Project(ifst, fst::PROJECT_INPUT); + fst::Project(ifst, fst::ProjectType::INPUT); } } diff --git a/src/fstext/determinize-lattice-test.cc b/src/fstext/determinize-lattice-test.cc index ae902021c7d..5e4f1812930 100644 --- a/src/fstext/determinize-lattice-test.cc +++ b/src/fstext/determinize-lattice-test.cc @@ -22,8 +22,6 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" -#include "fstext/openfst_compat.h" - namespace fst { using std::vector; using std::cout; @@ -96,7 +94,7 @@ template void TestDeterminizeLattice() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst det_fst; try { @@ -108,7 +106,7 @@ template void TestDeterminizeLattice() { std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(det_fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } assert(det_fst.Properties(kIDeterministic, true) & kIDeterministic); // OK, now determinize it a different way and check equivalence. @@ -119,7 +117,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact FST is:\n"; { FstPrinter fstprinter(compact_fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } if (kaldi::Rand() % 2 == 1) ConvertLattice(det_fst, &compact_det_fst, false); @@ -130,7 +128,7 @@ template void TestDeterminizeLattice() { std::cout << "Compact version of determinized FST is:\n"; { FstPrinter fstprinter(compact_det_fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } assert(RandEquivalent(compact_det_fst, compact_fst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); @@ -151,14 +149,14 @@ template void TestDeterminizeLattice2() { std::cout << "FST before lattice-determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst ofst; DeterminizeLattice(*fst, &ofst); std::cout << "FST after lattice-determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } delete fst; } diff --git a/src/fstext/determinize-star-inl.h b/src/fstext/determinize-star-inl.h index 36c9ba397a6..e9650ca29a6 100644 --- a/src/fstext/determinize-star-inl.h +++ b/src/fstext/determinize-star-inl.h @@ -725,7 +725,7 @@ void DeterminizerStar::EpsilonClosure:: { // this sorting is based on StateId - std::sort(ecinfo_.begin(), ecinfo_.end()); + sort(ecinfo_.begin(), ecinfo_.end()); 
output_subset->clear(); diff --git a/src/fstext/determinize-star-test.cc b/src/fstext/determinize-star-test.cc index c3fabb8a21e..272774b20aa 100644 --- a/src/fstext/determinize-star-test.cc +++ b/src/fstext/determinize-star-test.cc @@ -24,7 +24,6 @@ #include "fstext/trivial-factor-weight.h" #include "fstext/fst-test-utils.h" -#include "fstext/openfst_compat.h" namespace fst { @@ -39,7 +38,7 @@ template void TestDeterminizeGeneral() { std::cout << "FST before determinizing is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst ofst; try { @@ -47,7 +46,7 @@ template void TestDeterminizeGeneral() { std::cout << "FST after determinizing is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } assert(RandEquivalent(*fst, ofst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length, max*/)); } catch (...) { @@ -102,7 +101,7 @@ template void TestDeterminize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } // Trim resulting FST. Connect(fst); @@ -110,7 +109,7 @@ template void TestDeterminize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -123,7 +122,7 @@ template void TestDeterminize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -139,7 +138,7 @@ template void TestDeterminize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -158,14 +157,14 @@ template void TestDeterminize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); assert(ofst_orig.Properties(kIDeterministic, true) == kIDeterministic); } { std::cout <<" printing after determinization [star]\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); assert(ofst_star.Properties(kIDeterministic, true) == kIDeterministic); } @@ -175,7 +174,7 @@ template void TestDeterminize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; @@ -243,7 +242,7 @@ template void TestPush() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard 
output"); + fstprinter.Print(std::cout, "standard output"); } // Trim resulting FST. Connect(fst); @@ -251,7 +250,7 @@ template void TestPush() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -268,7 +267,7 @@ template void TestPush() { std::cout <<" printing after pushing\n"; { FstPrinter fstprinter(fst_pushed, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } assert(RandEquivalent(*fst, fst_pushed, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -321,7 +320,7 @@ template void TestMinimize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } // Trim resulting FST. Connect(fst); @@ -329,7 +328,7 @@ template void TestMinimize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -342,7 +341,7 @@ template void TestMinimize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -358,7 +357,7 @@ template void TestMinimize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst ofst_orig; VectorFst ofst_star; @@ -371,7 +370,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization [baseline]\n"; FstPrinter fstprinter(ofst_orig, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -383,7 +382,7 @@ template void TestMinimize() { { std::cout <<" printing after determinization by DeterminizeStar [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -393,7 +392,7 @@ template void TestMinimize() { { std::cout <<" printing after pushing weights [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -402,7 +401,7 @@ template void TestMinimize() { { std::cout <<" printing after minimization [in gallic]\n"; FstPrinter > fstprinter(gallic_fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } printf("Converting gallic back to regular [my approach]\n"); @@ -411,7 +410,7 @@ template void TestMinimize() { { std::cout <<" printing factor-weight FST\n"; FstPrinter > fstprinter(fwfst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + 
fstprinter.Print(std::cout, "standard output"); } Map(fwfst, &ofst_star, FromGallicMapper()); @@ -419,7 +418,7 @@ template void TestMinimize() { { std::cout <<" printing after converting back to regular FST\n"; FstPrinter fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } } @@ -432,7 +431,7 @@ template void TestMinimize() { std::cout <<" printing after removing "< fstprinter(ofst_star, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; diff --git a/src/fstext/factor-test.cc b/src/fstext/factor-test.cc index d58dbfa539c..9f13b8b9695 100644 --- a/src/fstext/factor-test.cc +++ b/src/fstext/factor-test.cc @@ -23,7 +23,6 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" -#include "fstext/openfst_compat.h" namespace fst { @@ -80,7 +79,7 @@ template static void TestFactor() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } // Trim resulting FST. Connect(&fst); @@ -88,7 +87,7 @@ template static void TestFactor() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } if (fst.Start() == kNoStateId) return; // "Connect" made it empty. diff --git a/src/fstext/fstext-lib.h b/src/fstext/fstext-lib.h index 03c8e5861dd..bdb8ff730e5 100644 --- a/src/fstext/fstext-lib.h +++ b/src/fstext/fstext-lib.h @@ -20,9 +20,6 @@ #ifndef KALDI_FSTEXT_FSTEXT_LIB_H_ #define KALDI_FSTEXT_FSTEXT_LIB_H_ #include "fst/fstlib.h" - -#include "fstext/openfst_compat.h" - #include "fstext/context-fst.h" #include "fstext/determinize-star.h" #include "fstext/factor.h" diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index fb3a637bc19..7d491a17559 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -151,9 +151,10 @@ template LookaheadFst *LookaheadComposeFst(const Fst &ifst1, const Fst &ifst2, const std::vector &to_remove) { - fst::CacheOptions cache_opts(true, 1 << 25LL); - fst::CacheOptions cache_opts_map(true, 0); - fst::ArcMapFstOptions arcmap_opts(cache_opts); + fst::CacheOptions cache_opts(true, 0); + fst::CacheOptions cache_opts_map(true, 1 << 26LL); + fst::ArcMapFstOptions arcmap_opts(cache_opts_map); + RemoveSomeInputSymbolsMapper mapper(to_remove); return new LookaheadFst(ComposeFst(ifst1, ifst2, cache_opts), mapper, arcmap_opts); } @@ -374,7 +375,6 @@ void GetSymbols(const SymbolTable &symtab, std::vector *syms_out) { KALDI_ASSERT(syms_out != NULL); syms_out->clear(); -#if OPENFST_VER >= 10800 for (SymbolTable::iterator iter = symtab.begin(); iter != symtab.end(); ++iter) { @@ -383,16 +383,6 @@ void GetSymbols(const SymbolTable &symtab, KALDI_ASSERT(syms_out->back() == iter->Label()); // an integer-range thing. } } -#else - for (SymbolTableIterator iter(symtab); - !iter.Done(); - iter.Next()) { - if (include_eps || iter.Value() != 0) { - syms_out->push_back(iter.Value()); - KALDI_ASSERT(syms_out->back() == iter.Value()); // an integer-range thing. 
- } - } -#endif } template diff --git a/src/fstext/fstext-utils-test.cc b/src/fstext/fstext-utils-test.cc index 460e49c7dec..4bf72d9868f 100644 --- a/src/fstext/fstext-utils-test.cc +++ b/src/fstext/fstext-utils-test.cc @@ -23,8 +23,6 @@ #include "util/stl-utils.h" #include "base/kaldi-math.h" -#include "fstext/openfst_compat.h" - namespace fst { using std::vector; @@ -142,7 +140,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } // Trim resulting FST. Connect(fst); @@ -150,7 +148,7 @@ template void TestSafeDeterminizeWrapper() { // also tests SafeDete std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -204,7 +202,7 @@ template void TestAcceptorMinimize() { VectorFst *fst = RandFst(); - Project(fst, PROJECT_INPUT); + Project(fst, fst::ProjectType::INPUT); RemoveWeights(fst); VectorFst fst2(*fst); @@ -311,7 +309,7 @@ template void TestMakeLoopFst() { for (int i = 0; i < num_fsts; i++) { if (kaldi::Rand() % 2 == 0) { // put an fst there. VectorFst *fst = RandFst(); - Project(fst, PROJECT_INPUT); // make input & output labels the same. + Project(fst, fst::ProjectType::INPUT); // make input & output labels the same. fsts[i] = fst; } else { // this is to test that it works with the caching. fsts[i] = fsts[i/2]; @@ -364,7 +362,7 @@ void TestEqualAlign() { template void Print(const Fst &fst, std::string message) { std::cout << message << "\n"; FstPrinter fstprinter(fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -379,7 +377,7 @@ void TestRemoveUselessArcs() { RandGenOptions > randgen_opts(selector); VectorFst fst_path; RandGen(*fst, &fst_path, randgen_opts); - Project(&fst_path, PROJECT_INPUT); + Project(&fst_path, fst::ProjectType::INPUT); // Print(fst_path, "[testremoveuselessarcs]:fstpath:"); VectorFst fst_nouseless(*fst); diff --git a/src/fstext/kaldi-fst-io-inl.h b/src/fstext/kaldi-fst-io-inl.h index 3baa5b95c9c..f7bb3a7c2b5 100644 --- a/src/fstext/kaldi-fst-io-inl.h +++ b/src/fstext/kaldi-fst-io-inl.h @@ -24,8 +24,6 @@ #include "util/text-utils.h" -#include "fstext/openfst_compat.h" - namespace fst { @@ -46,8 +44,7 @@ void WriteFstKaldi(std::ostream &os, bool binary, bool acceptor = false, write_one = false; FstPrinter printer(t, t.InputSymbols(), t.OutputSymbols(), NULL, acceptor, write_one, "\t"); - //printer.Print(&os, ""); - printer_print(os, printer, ""); + printer.Print(os, ""); if (os.fail()) KALDI_ERR << "Stream failure detected writing FST to stream"; // Write another newline as a terminating character. 
The read routine will @@ -102,7 +99,7 @@ void ReadFstKaldi(std::istream &is, bool binary, fst->DeleteStates(); string line; size_t nline = 0; - string separator = FST_FLAGS_fst_field_separator + "\r\n"; + string separator = FLAGS_fst_field_separator + "\r\n"; while (std::getline(is, line)) { nline++; vector col; diff --git a/src/fstext/kaldi-fst-io.cc b/src/fstext/kaldi-fst-io.cc index 61d6cc74724..626e6508a39 100644 --- a/src/fstext/kaldi-fst-io.cc +++ b/src/fstext/kaldi-fst-io.cc @@ -132,7 +132,7 @@ fst::VectorFst *ReadAndPrepareLmFst(std::string rxfilename) { // symbol #0 on the input symbols of the backoff arc, and projection will // replace them with epsilons which is what is on the output symbols of // those arcs. - fst::Project(ans, fst::PROJECT_OUTPUT); + fst::Project(ans, fst::ProjectType::OUTPUT); } if (ans->Properties(fst::kILabelSorted, true) == 0) { // Make sure LM is sorted on ilabel. diff --git a/src/fstext/kaldi-fst-io.h b/src/fstext/kaldi-fst-io.h index 3c34f4b4787..a45920936ec 100644 --- a/src/fstext/kaldi-fst-io.h +++ b/src/fstext/kaldi-fst-io.h @@ -26,7 +26,6 @@ #include #include #include "base/kaldi-common.h" -#include "fstext/openfst_compat.h" // Some functions for writing Fsts. // I/O for FSTs is a bit of a mess, and not very well integrated with Kaldi's diff --git a/src/fstext/lattice-utils-test.cc b/src/fstext/lattice-utils-test.cc index 6f1d2747cc1..13b4123db4b 100644 --- a/src/fstext/lattice-utils-test.cc +++ b/src/fstext/lattice-utils-test.cc @@ -21,8 +21,6 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" -#include "fstext/openfst_compat.h" - namespace fst { template void TestConvert(bool invert) { @@ -33,7 +31,7 @@ template void TestConvert(bool invert) { std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst ofst; ConvertLattice(*fst, &ofst, invert); @@ -41,14 +39,14 @@ template void TestConvert(bool invert) { std::cout << "FST after converting is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst origfst; ConvertLattice(ofst, &origfst, invert); std::cout << "FST after back conversion is:\n"; { FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -69,7 +67,7 @@ template void TestShortestPath() { std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst cfst; ConvertLattice(*fst, &cfst, false); // invert == false @@ -101,9 +99,10 @@ template void TestShortestPath() { assert(ApproxEqual(ShortestDistance(nbest_fst_1), ShortestDistance(nbest_fst_1b))); - // since semiring is idempotent, this should succeed too. 
- assert(ApproxEqual(ShortestDistance(cfst), - ShortestDistance(nbest_fst_1b))); + // since semiring is idempotent, this should succeed too + // in theory, but not in practice + // assert(ApproxEqual(ShortestDistance(cfst), + // ShortestDistance(nbest_fst_1b))); } delete fst; @@ -207,7 +206,7 @@ template void TestConvertPair(bool invert) { /*std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); }*/ VectorFst ofst; ConvertLattice(*fst, &ofst, invert); @@ -215,14 +214,14 @@ template void TestConvertPair(bool invert) { /*std::cout << "FST after converting is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); }*/ VectorFst origfst; ConvertLattice(ofst, &origfst, invert); /*std::cout << "FST after back conversion is:\n"; { FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); }*/ assert(RandEquivalent(*fst, origfst, 5/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 100/*path length-- max?*/)); @@ -262,7 +261,7 @@ template void TestScalePair(bool invert) { /*std::cout << "FST before converting to compact-arc is:\n"; { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); }*/ VectorFst ofst; ConvertLattice(*fst, &ofst, invert); @@ -270,7 +269,7 @@ template void TestScalePair(bool invert) { /*std::cout << "FST after converting and scaling is:\n"; { FstPrinter fstprinter(ofst, NULL, NULL, NULL, false, true); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); }*/ VectorFst origfst; ConvertLattice(ofst, &origfst, invert); @@ -278,7 +277,7 @@ template void TestScalePair(bool invert) { /*std::cout << "FST after back conversion and scaling is:\n"; { FstPrinter fstprinter(origfst, NULL, NULL, NULL, false, true); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); }*/ // If RandEquivalent doesn't work, it could be due to a nasty issue related to the use // of exact floating-point comparisons in the Plus function of LatticeWeight. diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index 1396764000a..7637c4d1c55 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -23,7 +23,6 @@ #include "fst/fstlib.h" #include "base/kaldi-common.h" -#include "fstext/openfst_compat.h" namespace fst { @@ -397,8 +396,8 @@ inline bool ApproxEqual(const LatticeWeightTpl &w1, template inline std::ostream &operator <<(std::ostream &strm, const LatticeWeightTpl &w) { LatticeWeightTpl::WriteFloatType(strm, w.Value1()); - CHECK(FST_FLAGS_fst_weight_separator.size() == 1); - strm << FST_FLAGS_fst_weight_separator[0]; // comma by default; + CHECK(FLAGS_fst_weight_separator.size() == 1); + strm << FLAGS_fst_weight_separator[0]; // comma by default; // may or may not be settable from Kaldi programs. 
LatticeWeightTpl::WriteFloatType(strm, w.Value2()); return strm; @@ -406,9 +405,9 @@ inline std::ostream &operator <<(std::ostream &strm, const LatticeWeightTpl inline std::istream &operator >>(std::istream &strm, LatticeWeightTpl &w1) { - CHECK(FST_FLAGS_fst_weight_separator.size() == 1); + CHECK(FLAGS_fst_weight_separator.size() == 1); // separator defaults to ',' - return w1.ReadNoParen(strm, FST_FLAGS_fst_weight_separator[0]); + return w1.ReadNoParen(strm, FLAGS_fst_weight_separator[0]); } @@ -439,9 +438,11 @@ class CompactLatticeWeightTpl { CompactLatticeWeightTpl(const WeightType &w, const std::vector &s): weight_(w), string_(s) { } - CompactLatticeWeightTpl(const CompactLatticeWeightTpl &compactLatticeWeightTpl) = default; - - CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) = default; + CompactLatticeWeightTpl &operator=(const CompactLatticeWeightTpl &w) { + weight_ = w.weight_; + string_ = w.string_; + return *this; + } const W &Weight() const { return weight_; } @@ -727,8 +728,8 @@ inline CompactLatticeWeightTpl Divide(const CompactLatticeW template inline std::ostream &operator <<(std::ostream &strm, const CompactLatticeWeightTpl &w) { strm << w.Weight(); - CHECK(FST_FLAGS_fst_weight_separator.size() == 1); - strm << FST_FLAGS_fst_weight_separator[0]; // comma by default. + CHECK(FLAGS_fst_weight_separator.size() == 1); + strm << FLAGS_fst_weight_separator[0]; // comma by default. for(size_t i = 0; i < w.String().size(); i++) { strm << w.String()[i]; if (i+1 < w.String().size()) @@ -744,8 +745,8 @@ inline std::istream &operator >>(std::istream &strm, CompactLatticeWeightTpl= 10800 - - -template -auto Map(Args&&... args) -> decltype(ArcMap(std::forward(args)...)) { - return ArcMap(std::forward(args)...); -} - -using MapFstOptions=ArcMapFstOptions; - -template -using MapFst = ArcMapFst; - -template -void printer_print(Stream &os, Printer &printer, const std::string &s) { - printer.Print(os, s); -} - -#else - -template -void printer_print(Stream &os, Printer &printer, const std::string &s) { - printer.Print(&os, s); -} - -#endif - -} // namespace fst - -#endif //KALDI_FSTEXT_OPENFST_COMPAT_H diff --git a/src/fstext/pre-determinize-inl.h b/src/fstext/pre-determinize-inl.h index 45e1a82279a..998fb2997ad 100644 --- a/src/fstext/pre-determinize-inl.h +++ b/src/fstext/pre-determinize-inl.h @@ -235,13 +235,8 @@ inline bool HasBannedPrefixPlusDigits(SymbolTable *symTable, std::string prefix, assert(symTable != NULL); const char *prefix_ptr = prefix.c_str(); size_t prefix_len = strlen(prefix_ptr); // allowed to be zero but not encouraged. -#if OPENFST_VER >= 10800 for (SymbolTable::iterator siter = symTable->begin(); siter != symTable->end(); ++siter) { const std::string &sym = siter->Symbol(); -#else - for (SymbolTableIterator siter(*symTable); !siter.Done(); siter.Next()) { - const std::string &sym = siter.Symbol(); -#endif if (!strncmp(prefix_ptr, sym.c_str(), prefix_len)) { // has prefix. if (isdigit(sym[prefix_len])) { // we don't allow prefix followed by a digit, as a symbol. // Has at least one digit. @@ -416,6 +411,8 @@ void PreDeterminize(MutableFst *fst, std::vector d_vec(max_state+1, false); // "done vector". Purely for debugging. + size_t num_extra_det_states = 0; + // (D)(v) while (Q.size() != 0) { @@ -494,6 +491,7 @@ void PreDeterminize(MutableFst *fst, assert(m_map.count(this_pr.first) == 0); m_map[this_pr.first] = k; k++; + num_extra_det_states++; } } else { // Create the set V_t. 
V_t.insert(this_pr.second); @@ -691,9 +689,11 @@ typename Arc::StateId CreateSuperFinal(MutableFst *fst) { typedef typename Arc::Weight Weight; assert(fst != NULL); StateId num_states = fst->NumStates(); + StateId num_final = 0; std::vector final_states; for (StateId s = 0; s < num_states; s++) { if (fst->Final(s) != Weight::Zero()) { + num_final++; final_states.push_back(s); } } diff --git a/src/fstext/pre-determinize-test.cc b/src/fstext/pre-determinize-test.cc index 60953e40b8d..95ebd62f04f 100644 --- a/src/fstext/pre-determinize-test.cc +++ b/src/fstext/pre-determinize-test.cc @@ -22,7 +22,8 @@ #include "fstext/fst-test-utils.h" #include "fstext/fstext-utils.h" -#include "fstext/openfst_compat.h" +// Just check that it compiles, for now. + namespace fst { using std::vector; @@ -72,7 +73,7 @@ template void TestPreDeterminize() { std::cout <<" printing before trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } // Trim resulting FST. Connect(fst); @@ -80,7 +81,7 @@ template void TestPreDeterminize() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst *fst_copy_orig = new VectorFst(*fst); @@ -94,7 +95,7 @@ template void TestPreDeterminize() { std::cout <<" printing after predeterminization\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -110,7 +111,7 @@ template void TestPreDeterminize() { std::cout <<" printing after epsilon removal\n"; { FstPrinter fstprinter(*fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -120,14 +121,14 @@ template void TestPreDeterminize() { std::cout <<" printing after determinization\n"; { FstPrinter fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } int64 num_removed = DeleteISymbols(&ofst, extra_syms); std::cout <<" printing after removing "< fstprinter(ofst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } std::cout <<" Checking equivalent to original FST.\n"; @@ -179,7 +180,7 @@ template void TestAddSelfLoops() { std::cout <<" printing before adding self-loops\n"; { FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -198,7 +199,7 @@ template void TestAddSelfLoops() { std::cout <<" printing after adding self-loops\n"; { FstPrinter fstprinter(*fst, ilabels, olabels, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } delete fst; diff --git a/src/fstext/prune-special-test.cc b/src/fstext/prune-special-test.cc index f91001fca0d..f27b54f4587 100644 --- a/src/fstext/prune-special-test.cc +++ b/src/fstext/prune-special-test.cc @@ -22,8 +22,6 @@ #include "fstext/rand-fst.h" #include "fstext/fstext-utils.h" -#include "fstext/openfst_compat.h" - namespace fst { static void 
TestPruneSpecial() { @@ -40,7 +38,7 @@ static void TestPruneSpecial() { { FstPrinter fstprinter(*ifst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); std::cout << std::endl; } @@ -49,7 +47,7 @@ static void TestPruneSpecial() { PruneSpecial(*ifst, &ofst1, beam); { FstPrinter fstprinter(ofst1, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); std::cout << std::endl; } @@ -58,7 +56,7 @@ static void TestPruneSpecial() { Prune(*ifst, &ofst2, beam); { FstPrinter fstprinter(ofst2, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); std::cout << std::endl; } diff --git a/src/fstext/push-special-test.cc b/src/fstext/push-special-test.cc index 9fe8ba63b59..9cf16bb8a84 100644 --- a/src/fstext/push-special-test.cc +++ b/src/fstext/push-special-test.cc @@ -23,8 +23,6 @@ #include "fstext/fstext-utils.h" #include "base/kaldi-math.h" -#include "fstext/openfst_compat.h" - namespace fst { @@ -40,7 +38,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(*fst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst fst_copy(*fst); @@ -58,7 +56,7 @@ static void TestPushSpecial() { { FstPrinter fstprinter(fst_copy, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } KALDI_LOG << "Min value is " << min.Value() << ", max value is " << max.Value(); diff --git a/src/fstext/remove-eps-local-test.cc b/src/fstext/remove-eps-local-test.cc index 1548ac5c726..2e1d3d8cfa1 100644 --- a/src/fstext/remove-eps-local-test.cc +++ b/src/fstext/remove-eps-local-test.cc @@ -23,7 +23,6 @@ #include "fstext/fst-test-utils.h" #include "base/kaldi-math.h" -#include "fstext/openfst_compat.h" namespace fst { @@ -84,7 +83,7 @@ template static void TestRemoveEpsLocal() { std::cout <<" printing after trimming\n"; { FstPrinter fstprinter(fst, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst fst_copy1(fst); @@ -97,7 +96,7 @@ template static void TestRemoveEpsLocal() { { std::cout << "copy1 = \n"; FstPrinter fstprinter(fst_copy1, sptr, sptr, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } @@ -142,7 +141,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst = \n"; FstPrinter fstprinter(*logfst, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } VectorFst fst; @@ -157,7 +156,7 @@ static void TestRemoveEpsLocalSpecial() { { std::cout << "logfst2 = \n"; FstPrinter fstprinter(logfst2, NULL, NULL, NULL, false, true, "\t"); - printer_print(std::cout, fstprinter, "standard output"); + fstprinter.Print(std::cout, "standard output"); } if (ApproxEqual(ShortestDistance(*logfst), ShortestDistance(logfst2))) { // make sure we preserved stochasticity in cases where doing so was diff --git a/src/fstext/table-matcher-test.cc b/src/fstext/table-matcher-test.cc index 1cc8bd02bef..0e8982720d4 100644 --- a/src/fstext/table-matcher-test.cc +++ 
diff --git a/src/fstext/table-matcher-test.cc b/src/fstext/table-matcher-test.cc
index 1cc8bd02bef..0e8982720d4 100644
--- a/src/fstext/table-matcher-test.cc
+++ b/src/fstext/table-matcher-test.cc
@@ -21,8 +21,6 @@
 #include "fstext/fst-test-utils.h"
 #include "base/kaldi-math.h"
-#include "fstext/openfst_compat.h"
-
 namespace fst{
@@ -66,13 +64,13 @@ template<class Arc> void TestTableMatcher(bool connect, bool left) {
   std::cout <<"Table-Composed FST\n";
   {
     FstPrinter<Arc> fstprinter(composed, NULL, NULL, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }
   std::cout <<" Baseline-Composed FST\n";
   {
     FstPrinter<Arc> fstprinter(composed_baseline, NULL, NULL, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }
   if ( !RandEquivalent(composed, composed_baseline, 3/*paths*/, 0.01/*delta*/, kaldi::Rand()/*seed*/, 20/*path length-- max?*/)) {
@@ -81,7 +79,7 @@ template<class Arc> void TestTableMatcher(bool connect, bool left) {
   std::cout <<" Diff1 (composed - baseline) \n";
   {
     FstPrinter<Arc> fstprinter(diff1, NULL, NULL, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }

@@ -90,7 +88,7 @@ template<class Arc> void TestTableMatcher(bool connect, bool left) {
   std::cout <<" Diff2 (baseline - composed) \n";
   {
     FstPrinter<Arc> fstprinter(diff2, NULL, NULL, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }
   assert(0);
@@ -151,7 +149,7 @@ template<class Arc> void TestTableMatcherCacheLeft(bool connect) {
   std::cout <<" Diff1 (composed - baseline) \n";
   {
     FstPrinter<Arc> fstprinter(diff1, NULL, NULL, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }

@@ -160,7 +158,7 @@ template<class Arc> void TestTableMatcherCacheLeft(bool connect) {
   std::cout <<" Diff2 (baseline - composed) \n";
   {
     FstPrinter<Arc> fstprinter(diff2, NULL, NULL, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }
   assert(0);
@@ -221,7 +219,7 @@ template<class Arc> void TestTableMatcherCacheRight(bool connect) {
   std::cout <<" Diff1 (composed - baseline) \n";
   {
     FstPrinter<Arc> fstprinter(diff1, NULL, NULL, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }

@@ -230,7 +228,7 @@ template<class Arc> void TestTableMatcherCacheRight(bool connect) {
   std::cout <<" Diff2 (baseline - composed) \n";
   {
     FstPrinter<Arc> fstprinter(diff2, NULL, NULL, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }
   assert(0);
diff --git a/src/fstext/table-matcher.h b/src/fstext/table-matcher.h
index 9e921920c48..290a4f8bc2e 100644
--- a/src/fstext/table-matcher.h
+++ b/src/fstext/table-matcher.h
@@ -22,7 +22,7 @@
 #include <fst/fstlib.h>
 #include <fst/fst-decl.h>
-#include "base/kaldi-types.h"
+

 namespace fst {
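All three table-matcher tests above share one check-and-dump pattern: compose with TableMatcher, compose again with the ordinary matcher as a baseline, compare the two by sampling random paths, and on mismatch print both difference FSTs before assert(0). The sampling comparison, pulled out as a sketch (the FST arguments and fixed seed are assumptions; the constants mirror the test's 3 paths, 0.01 delta, and max path length 20):

#include <cassert>
#include <fst/fstlib.h>

// Sampling-based equivalence check as used by the tests above: draw random
// paths from each FST and compare their weights. Sketch only; the two FSTs
// and the hard-coded seed are assumptions.
void CheckAgainstBaseline(const fst::StdVectorFst &composed,
                          const fst::StdVectorFst &baseline) {
  bool same = fst::RandEquivalent(composed, baseline,
                                  3 /* num paths */, 0.01 /* delta */,
                                  12345 /* seed */, 20 /* max path length */);
  assert(same);
}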
diff --git a/src/fstext/trivial-factor-weight-test.cc b/src/fstext/trivial-factor-weight-test.cc
index 556d194a60d..3045a669362 100644
--- a/src/fstext/trivial-factor-weight-test.cc
+++ b/src/fstext/trivial-factor-weight-test.cc
@@ -22,8 +22,7 @@
 #include "fstext/determinize-star.h"
 #include "fstext/trivial-factor-weight.h"
 #include "fstext/fst-test-utils.h"
-
-#include "fstext/openfst_compat.h"
+// Just check that it compiles, for now.

 namespace fst {
@@ -74,7 +73,7 @@ template<class Arc> void TestFactor() {
   std::cout <<" printing before trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }
   // Trim resulting FST.
   Connect(fst);
@@ -82,7 +81,7 @@ template<class Arc> void TestFactor() {
   std::cout <<" printing after trimming\n";
   {
     FstPrinter<Arc> fstprinter(*fst, sptr, sptr, NULL, false, true, "\t");
-    printer_print(std::cout, fstprinter, "standard output");
+    fstprinter.Print(std::cout, "standard output");
   }
   vector
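For reference, the post-patch printing idiom that all of these test files now share, as a self-contained sketch. The constructor arguments mirror the calls in the hunks above (no symbol tables, not an acceptor, show weights equal to One(), tab separator); the wrapper function name is an assumption, and FstPrinter is assumed to live in <fst/script/print-impl.h> as in recent OpenFst releases.

#include <iostream>
#include <fst/fstlib.h>
#include <fst/script/print-impl.h>

// Post-patch idiom: construct an FstPrinter and call its Print() member
// directly (stream passed by reference), with no compat shim in between.
void PrintStdFst(const fst::StdVectorFst &fst) {
  fst::FstPrinter<fst::StdArc> fstprinter(fst,
                                          NULL,   // input symbol table
                                          NULL,   // output symbol table
                                          NULL,   // state symbol table
                                          false,  // not an acceptor
                                          true,   // print weights equal to One()
                                          "\t");  // field separator
  // The second argument names the destination in error messages only.
  fstprinter.Print(std::cout, "standard output");
}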