Commit 1d44da8

RNN-T Conformer training for LibriSpeech (#143)

* Begin to add RNN-T training for LibriSpeech.
* Copy files from conformer_ctc. Will edit them.
* Use conformer/transformer model as encoder.
* Begin to add training script.
* Add training code.
* Remove long utterances to avoid OOM when a large max_duration is used.
* Begin to add decoding script.
* Add decoding script.
* Minor fixes.
* Add beam search.
* Use LSTM layers for the encoder. Need more tunings.
* Use stateless decoder.
* Minor fixes to make it ready for merge.
* Fix README.
* Update RESULTS.md to include RNN-T Conformer.
* Minor fixes.
* Fix tests.
* Minor fixes.
* Minor fixes.
* Fix tests.

1 parent 76a51bf, commit 1d44da8
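The "Use stateless decoder" step above refers to the RNN-T prediction network: instead of an LSTM that carries recurrent state across the whole hypothesis, the decoder looks only at a fixed number of previously emitted tokens. A minimal NumPy sketch of the idea (the class name, the mean-pooling over the context window, and all dimensions are illustrative assumptions, not this repository's implementation):

```python
import numpy as np


class StatelessDecoder:
    """Sketch of a stateless RNN-T prediction network: its output
    depends only on the last `context_size` emitted tokens, so no
    recurrent state needs to be tracked during decoding."""

    def __init__(self, vocab_size: int, embed_dim: int,
                 context_size: int = 2, seed: int = 0):
        rng = np.random.default_rng(seed)
        # Token embedding table (randomly initialized for the sketch).
        self.embedding = rng.standard_normal((vocab_size, embed_dim))
        self.context_size = context_size

    def forward(self, tokens: list) -> np.ndarray:
        # Keep only the limited left context.
        ctx = tokens[-self.context_size:]
        if not ctx:
            # Nothing emitted yet: return a zero vector.
            return np.zeros(self.embedding.shape[1])
        # Mean-pool the context embeddings into one decoder output.
        return self.embedding[ctx].mean(axis=0)
```

Because the output depends only on the most recent tokens, hypotheses that share the same short history can share decoder output, which simplifies both greedy and beam search.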

53 files changed: +8964, -11 lines

.github/workflows/test.yml (+8, -4)

```diff
@@ -103,8 +103,10 @@ jobs:
         cd egs/librispeech/ASR/conformer_ctc
         pytest -v -s

-        cd ..
-        pytest -v -s ./transducer
+        if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
+          cd ../transducer
+          pytest -v -s
+        fi

       - name: Run tests
         if: startsWith(matrix.os, 'macos')
@@ -120,5 +122,7 @@ jobs:
         cd egs/librispeech/ASR/conformer_ctc
         pytest -v -s

-        cd ..
-        pytest -v -s ./transducer
+        if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
+          cd ../transducer
+          pytest -v -s
+        fi
```
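The workflow change above runs the transducer tests only when the CI matrix selects torchaudio 0.10.0. The same gate can be sketched as a standalone POSIX shell function; the function name and the `TORCHAUDIO_VERSION` variable are illustrative stand-ins for the `${{ matrix.torchaudio }}` expression:

```shell
#!/bin/sh
# Hypothetical standalone version of the CI version gate.
should_run_transducer_tests() {
    # Succeed only for the torchaudio version the tests are pinned to.
    [ "$1" = "0.10.0" ]
}

if should_run_transducer_tests "${TORCHAUDIO_VERSION:-0.10.0}"; then
    echo "would run: pytest -v -s in ../transducer"
fi
```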

.gitignore (+1)

```diff
@@ -8,3 +8,4 @@ exp*/
 download
 *.bak
 *-bak
+*bak.py
```

README.md (+20, -2)

```diff
@@ -34,8 +34,11 @@ We do provide a Colab notebook for this recipe.

 ### LibriSpeech

-We provide two models for this recipe: [conformer CTC model][LibriSpeech_conformer_ctc]
-and [TDNN LSTM CTC model][LibriSpeech_tdnn_lstm_ctc].
+We provide 3 models for this recipe:
+
+- [conformer CTC model][LibriSpeech_conformer_ctc]
+- [TDNN LSTM CTC model][LibriSpeech_tdnn_lstm_ctc]
+- [RNN-T Conformer model][LibriSpeech_transducer]

 #### Conformer CTC Model

@@ -58,6 +61,20 @@ The WER for this model is:

 We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kNmDXNMwREi0rZGAOIAOJo93REBuOTcd?usp=sharing)

+
+#### RNN-T Conformer model
+
+Using Conformer as encoder.
+
+The best WER with greedy search is:
+
+|     | test-clean | test-other |
+|-----|------------|------------|
+| WER | 3.16       | 7.71       |
+
+We provide a Colab notebook to run a pre-trained RNN-T conformer model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1_u6yK9jDkPwG_NLrZMN2XK7Aeq4suMO2?usp=sharing)
+
+
 ### Aishell

 We provide two models for this recipe: [conformer CTC model][Aishell_conformer_ctc]
@@ -125,6 +142,7 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad

 [LibriSpeech_tdnn_lstm_ctc]: egs/librispeech/ASR/tdnn_lstm_ctc
 [LibriSpeech_conformer_ctc]: egs/librispeech/ASR/conformer_ctc
+[LibriSpeech_transducer]: egs/librispeech/ASR/transducer
 [Aishell_tdnn_lstm_ctc]: egs/aishell/ASR/tdnn_lstm_ctc
 [Aishell_conformer_ctc]: egs/aishell/ASR/conformer_ctc
 [TIMIT_tdnn_lstm_ctc]: egs/timit/ASR/tdnn_lstm_ctc
```
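The README above reports the best WER with greedy search. A schematic version of RNN-T greedy decoding, assuming a `joiner(frame, hyp)` callable that returns per-token scores (a stand-in for the real joint network, which combines encoder and decoder outputs):

```python
def greedy_search(encoder_out, joiner, blank_id=0, max_sym_per_frame=3):
    """Sketch of RNN-T greedy search: at each encoder frame, repeatedly
    emit the highest-scoring symbol until the joiner predicts blank
    (or a per-frame cap is hit), then advance to the next frame."""
    hyp = []
    for frame in encoder_out:
        for _ in range(max_sym_per_frame):
            scores = joiner(frame, hyp)
            best = max(range(len(scores)), key=scores.__getitem__)
            if best == blank_id:
                break  # blank: move on to the next frame
            hyp.append(best)
    return hyp
```

The `max_sym_per_frame` cap is a common safeguard against a joiner that never predicts blank; the exact limit used in the recipe is not shown in this commit.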

egs/librispeech/ASR/RESULTS.md (+46)

````diff
@@ -1,5 +1,51 @@
 ## Results

+### LibriSpeech BPE training results (RNN-T)
+
+#### 2021-12-17
+
+RNN-T + Conformer encoder
+
+The best WER is
+
+|     | test-clean | test-other |
+|-----|------------|------------|
+| WER | 3.16       | 7.71       |
+
+using `--epoch 26 --avg 12` during decoding with greedy search.
+
+The training command to reproduce the above WER is:
+
+```
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+./transducer/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 0 \
+  --exp-dir transducer/exp-lr-2.5-full \
+  --full-libri 1 \
+  --max-duration 250 \
+  --lr-factor 2.5
+```
+
+The decoding command is:
+
+```
+epoch=26
+avg=12
+
+./transducer/decode.py \
+  --epoch $epoch \
+  --avg $avg \
+  --exp-dir transducer/exp-lr-2.5-full \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  --max-duration 100
+```
+
+You can find the tensorboard log at: <https://tensorboard.dev/experiment/PYIbeD6zRJez1ViXaRqqeg/>
+
+
 ### LibriSpeech BPE training results (Conformer-CTC)

 #### 2021-11-09
````
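In the decoding command above, `--epoch 26 --avg 12` means decoding uses an element-wise average of the model parameters from the last 12 checkpoints ending at epoch 26. The averaging itself is just a mean over checkpoints; a sketch with plain Python lists standing in for tensors (the function name is illustrative, not the recipe's actual helper):

```python
def average_checkpoints(state_dicts):
    """Average the same-named parameter across several checkpoints.

    Each element of `state_dicts` maps a parameter name to a flat
    list of floats (a stand-in for a tensor)."""
    n = len(state_dicts)
    avg = {}
    for name in state_dicts[0]:
        # Gather the value at each position across all checkpoints.
        cols = zip(*(sd[name] for sd in state_dicts))
        avg[name] = [sum(vals) / n for vals in cols]
    return avg
```

Averaging the final checkpoints usually gives a small but consistent WER improvement over using any single epoch's weights.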

egs/librispeech/ASR/conformer_ctc/decode.py (-2)

```diff
@@ -428,8 +428,6 @@ def decode_dataset(
         The first is the reference transcript, and the second is the
         predicted result.
     """
-    results = []
-
     num_cuts = 0

     try:
```
New file (+215 lines): a helper script that prints duration statistics of the LibriSpeech manifests.

```python
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file displays duration statistics of utterances in a manifest.
You can use the displayed value to choose minimum/maximum duration
to remove short and long utterances during the training.

See the function `remove_short_and_long_utt()` in transducer/train.py
for usage.
"""


from lhotse import load_manifest


def main():
    # Each assignment overrides the previous one; only the last `path`
    # is used. Comment lines out to inspect a different manifest.
    path = "./data/fbank/cuts_train-clean-100.json.gz"
    path = "./data/fbank/cuts_train-clean-360.json.gz"
    path = "./data/fbank/cuts_train-other-500.json.gz"
    path = "./data/fbank/cuts_dev-clean.json.gz"
    path = "./data/fbank/cuts_dev-other.json.gz"
    path = "./data/fbank/cuts_test-clean.json.gz"
    path = "./data/fbank/cuts_test-other.json.gz"

    cuts = load_manifest(path)
    cuts.describe()


if __name__ == "__main__":
    main()


"""
## train-clean-100
Cuts count: 85617
Total duration (hours): 303.8
Speech duration (hours): 303.8 (100.0%)
***
Duration statistics (seconds):
mean    12.8
std     3.8
min     1.3
0.1%    1.9
0.5%    2.2
1%      2.5
5%      4.2
10%     6.4
25%     11.4
50%     13.8
75%     15.3
90%     16.7
95%     17.3
99%     18.1
99.5%   18.4
99.9%   18.8
max     27.2

## train-clean-360
Cuts count: 312042
Total duration (hours): 1098.2
Speech duration (hours): 1098.2 (100.0%)
***
Duration statistics (seconds):
mean    12.7
std     3.8
min     1.0
0.1%    1.8
0.5%    2.2
1%      2.5
5%      4.2
10%     6.2
25%     11.2
50%     13.7
75%     15.3
90%     16.6
95%     17.3
99%     18.1
99.5%   18.4
99.9%   18.8
max     33.0

## train-other-500
Cuts count: 446064
Total duration (hours): 1500.6
Speech duration (hours): 1500.6 (100.0%)
***
Duration statistics (seconds):
mean    12.1
std     4.2
min     0.8
0.1%    1.7
0.5%    2.1
1%      2.3
5%      3.5
10%     5.0
25%     9.8
50%     13.4
75%     15.1
90%     16.5
95%     17.2
99%     18.1
99.5%   18.4
99.9%   18.9
max     31.0

## dev-clean
Cuts count: 2703
Total duration (hours): 5.4
Speech duration (hours): 5.4 (100.0%)
***
Duration statistics (seconds):
mean    7.2
std     4.7
min     1.4
0.1%    1.6
0.5%    1.8
1%      1.9
5%      2.4
10%     2.7
25%     3.8
50%     5.9
75%     9.3
90%     13.3
95%     16.4
99%     23.8
99.5%   28.5
99.9%   32.3
max     32.6

## dev-other
Cuts count: 2864
Total duration (hours): 5.1
Speech duration (hours): 5.1 (100.0%)
***
Duration statistics (seconds):
mean    6.4
std     4.3
min     1.1
0.1%    1.3
0.5%    1.7
1%      1.8
5%      2.2
10%     2.6
25%     3.5
50%     5.3
75%     7.9
90%     12.0
95%     15.0
99%     22.2
99.5%   27.1
99.9%   32.4
max     35.2

## test-clean
Cuts count: 2620
Total duration (hours): 5.4
Speech duration (hours): 5.4 (100.0%)
***
Duration statistics (seconds):
mean    7.4
std     5.2
min     1.3
0.1%    1.6
0.5%    1.8
1%      2.0
5%      2.3
10%     2.7
25%     3.7
50%     5.8
75%     9.6
90%     14.6
95%     17.8
99%     25.5
99.5%   28.4
99.9%   32.8
max     35.0

## test-other
Cuts count: 2939
Total duration (hours): 5.3
Speech duration (hours): 5.3 (100.0%)
***
Duration statistics (seconds):
mean    6.5
std     4.4
min     1.2
0.1%    1.5
0.5%    1.8
1%      1.9
5%      2.3
10%     2.6
25%     3.4
50%     5.2
75%     8.2
90%     12.6
95%     15.8
99%     21.4
99.5%   23.8
99.9%   33.5
max     34.5
"""
```
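The statistics above motivate the cutoff in `remove_short_and_long_utt()`: nearly all training cuts fall between about 1 and 20 seconds, so dropping outliers loses almost no data while avoiding OOM from very long utterances. A self-contained sketch of such a filter (plain floats stand in for lhotse cuts; the bounds are illustrative, and with lhotse the same idea would be a `cuts.filter(...)` call):

```python
def remove_short_and_long_utt(durations, min_sec=1.0, max_sec=20.0):
    """Keep only utterances whose duration (in seconds) lies within
    [min_sec, max_sec]; everything else is dropped before training."""
    return [d for d in durations if min_sec <= d <= max_sec]
```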
New file (+19 lines):

## Introduction

In this folder, the encoder consists of Conformer layers. You can use the
following command to start the training:

```bash
cd egs/librispeech/ASR

export CUDA_VISIBLE_DEVICES="0,1,2,3"

./transducer/train.py \
  --world-size 4 \
  --num-epochs 30 \
  --start-epoch 0 \
  --exp-dir transducer/exp \
  --full-libri 1 \
  --max-duration 250 \
  --lr-factor 2.5
```
New file (+1 line): a symlink pointing to `../tdnn_lstm_ctc/asr_datamodule.py`.
