
Commit fb6a57e

Increase the size of the context in the RNN-T decoder. (#153)
1 parent cb04c8a commit fb6a57e

16 files changed, +1103 -131 lines
@@ -0,0 +1,108 @@
# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)

# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: run-pre-trained-transducer-stateless

on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]

jobs:
  run_pre_trained_transducer_stateless:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]

      fail-fast: false

    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies
        run: |
          python3 -m pip install --upgrade pip pytest
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/

          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          python3 -m pip install kaldifeat
          # We are in ./icefall and there is a file: requirements.txt in it
          pip install -r requirements.txt

      - name: Install graphviz
        shell: bash
        run: |
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz

      - name: Download pre-trained model
        shell: bash
        run: |
          sudo apt-get -qq install git-lfs tree sox
          cd egs/librispeech/ASR
          mkdir tmp
          cd tmp
          git lfs install
          git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22
          cd ..
          tree tmp
          soxi tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/test_wavs/*.wav
          ls -lh tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/test_wavs/*.wav

      - name: Run greedy search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method greedy_search \
            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/test_wavs/1221-135766-0002.wav

      - name: Run beam search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method beam_search \
            --beam-size 4 \
            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-22/test_wavs/1221-135766-0002.wav
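Before running these steps locally, it can help to confirm that the installed wheels actually provide the RNN-T loss the transducer recipes depend on. The snippet below is a small illustrative sanity check (it is not part of the workflow above); the assert mirrors the one used in the model code:

```python
# Illustrative environment check (not part of the CI workflow above):
# the transducer recipes need torchaudio >= 0.10.0, which is the first
# release that ships torchaudio.functional.rnnt_loss.
import torch
import torchaudio

print("torch:", torch.__version__)
print("torchaudio:", torchaudio.__version__)

assert hasattr(torchaudio.functional, "rnnt_loss"), (
    f"Current torchaudio version: {torchaudio.__version__}\n"
    "Please install a version >= 0.10.0"
)
print("rnnt_loss is available")
```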

.github/workflows/run-pretrained.yml

+1 -1

@@ -30,7 +30,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-18.04]
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]

.github/workflows/test.yml

+13 -1

@@ -32,7 +32,7 @@ jobs:
        # os: [ubuntu-18.04, macos-10.15]
        # disable macOS test for now.
        os: [ubuntu-18.04]
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8]
        torch: ["1.8.0", "1.10.0"]
        torchaudio: ["0.8.0", "0.10.0"]
        k2-version: ["1.9.dev20211101"]
@@ -106,6 +106,12 @@ jobs:
          if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
            cd ../transducer
            pytest -v -s
+
+            cd ../transducer_stateless
+            pytest -v -s
+
+            cd ../transducer_lstm
+            pytest -v -s
          fi

      - name: Run tests
@@ -125,4 +131,10 @@ jobs:
          if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
            cd ../transducer
            pytest -v -s
+
+            cd ../transducer_stateless
+            pytest -v -s
+
+            cd ../transducer_lstm
+            pytest -v -s
          fi

README.md

+21 -4

@@ -34,11 +34,12 @@ We do provide a Colab notebook for this recipe.

### LibriSpeech

-We provide 3 models for this recipe:
+We provide 4 models for this recipe:

- [conformer CTC model][LibriSpeech_conformer_ctc]
- [TDNN LSTM CTC model][LibriSpeech_tdnn_lstm_ctc]
-- [RNN-T Conformer model][LibriSpeech_transducer]
+- [Transducer: Conformer encoder + LSTM decoder][LibriSpeech_transducer]
+- [Transducer: Conformer encoder + Embedding decoder][LibriSpeech_transducer_stateless]

#### Conformer CTC Model

@@ -62,9 +63,9 @@ The WER for this model is:
We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kNmDXNMwREi0rZGAOIAOJo93REBuOTcd?usp=sharing)


-#### RNN-T Conformer model
+#### Transducer: Conformer encoder + LSTM decoder

-Using Conformer as encoder.
+Using Conformer as encoder and LSTM as decoder.

The best WER with greedy search is:

@@ -74,6 +75,21 @@ The best WER with greedy search is:

We provide a Colab notebook to run a pre-trained RNN-T conformer model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1_u6yK9jDkPwG_NLrZMN2XK7Aeq4suMO2?usp=sharing)

+#### Transducer: Conformer encoder + Embedding decoder
+
+Using Conformer as encoder. The decoder consists of 1 embedding layer
+and 1 convolutional layer.
+
+The best WER using beam search with beam size 4 is:
+
+|     | test-clean | test-other |
+|-----|------------|------------|
+| WER | 2.92       | 7.37       |
+
+Note: No auxiliary losses are used in the training and no LMs are used
+in the decoding.
+
+We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Lm37sNajIpkV4HTzMDF7sn9l0JpfmekN?usp=sharing)

### Aishell

@@ -143,6 +159,7 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
[LibriSpeech_tdnn_lstm_ctc]: egs/librispeech/ASR/tdnn_lstm_ctc
[LibriSpeech_conformer_ctc]: egs/librispeech/ASR/conformer_ctc
[LibriSpeech_transducer]: egs/librispeech/ASR/transducer
+[LibriSpeech_transducer_stateless]: egs/librispeech/ASR/transducer_stateless
[Aishell_tdnn_lstm_ctc]: egs/aishell/ASR/tdnn_lstm_ctc
[Aishell_conformer_ctc]: egs/aishell/ASR/conformer_ctc
[TIMIT_tdnn_lstm_ctc]: egs/timit/ASR/tdnn_lstm_ctc

egs/librispeech/ASR/README.md

+17

@@ -1,3 +1,20 @@

+# Introduction
+
Please refer to <https://icefall.readthedocs.io/en/latest/recipes/librispeech.html>
for how to run models in this recipe.
+
+# Transducers
+
+This folder contains several subfolders whose names include `transducer`.
+The following table lists the differences among them.
+
+|                        | Encoder   | Decoder            |
+|------------------------|-----------|--------------------|
+| `transducer`           | Conformer | LSTM               |
+| `transducer_stateless` | Conformer | Embedding + Conv1d |
+| `transducer_lstm`      | LSTM      | LSTM               |
+
+The decoder in `transducer_stateless` is modified from the paper
+[RNN-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
+We place an additional Conv1d layer right after the input embedding layer.
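To make the Embedding + Conv1d design concrete, here is a minimal sketch of such a decoder. It is illustrative only and simplified relative to the actual code in `transducer_stateless`; the layer sizes, causal padding, and blank handling below are assumptions, not the recipe's exact implementation:

```python
# Minimal sketch of a "stateless" transducer decoder: an embedding layer
# followed by a causal Conv1d whose kernel size equals the label context
# (2 here, i.e. the previous two tokens).
import torch
import torch.nn as nn
import torch.nn.functional as F


class StatelessDecoder(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int, context_size: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.context_size = context_size
        self.conv = nn.Conv1d(
            embedding_dim, embedding_dim, kernel_size=context_size, bias=False
        )

    def forward(self, y: torch.Tensor) -> torch.Tensor:
        # y: (N, U) token IDs; returns (N, U, embedding_dim)
        emb = self.embedding(y)                     # (N, U, D)
        emb = emb.permute(0, 2, 1)                  # (N, D, U) for Conv1d
        # Left-pad so position u only sees tokens <= u (causal convolution).
        emb = F.pad(emb, pad=(self.context_size - 1, 0))
        out = self.conv(emb)                        # (N, D, U)
        return out.permute(0, 2, 1)                 # (N, U, D)


decoder = StatelessDecoder(vocab_size=500, embedding_dim=256)
tokens = torch.randint(low=1, high=500, size=(4, 7))  # a batch of 4 label sequences
print(decoder(tokens).shape)  # torch.Size([4, 7, 256])
```

Because the convolution looks at only a fixed number of previous labels (the kernel size, 2 here), the prediction network carries no recurrent state across decoding steps; that limited label context is what "stateless" refers to.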

egs/librispeech/ASR/RESULTS.md

+62 -3

@@ -1,18 +1,77 @@
## Results

-### LibriSpeech BPE training results (RNN-T)
+### LibriSpeech BPE training results (Transducer)
+
+#### 2021-12-22
+Conformer encoder + non-recurrent decoder. The decoder
+contains only an embedding layer and a Conv1d (with kernel size 2).
+
+The WERs are
+
+|                           | test-clean | test-other | comment                                  |
+|---------------------------|------------|------------|------------------------------------------|
+| greedy search             | 2.99       | 7.52       | --epoch 20, --avg 10, --max-duration 100 |
+| beam search (beam size 2) | 2.95       | 7.43       |                                          |
+| beam search (beam size 3) | 2.94       | 7.37       |                                          |
+| beam search (beam size 4) | 2.92       | 7.37       |                                          |
+| beam search (beam size 5) | 2.93       | 7.38       |                                          |
+| beam search (beam size 8) | 2.92       | 7.38       |                                          |
+
+The training command for reproducing is given below:
+
+```
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+./transducer_stateless/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 0 \
+  --exp-dir transducer_stateless/exp-full \
+  --full-libri 1 \
+  --max-duration 250 \
+  --lr-factor 3
+```
+
+The tensorboard training log can be found at
+<https://tensorboard.dev/experiment/PsJ3LgkEQfOmzedAlYfVeg/#scalars&_smoothingWeight=0>
+
+The decoding command is:
+```
+epoch=20
+avg=10
+
+## greedy search
+./transducer_stateless/decode.py \
+  --epoch $epoch \
+  --avg $avg \
+  --exp-dir transducer_stateless/exp-full \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  --max-duration 100
+
+## beam search
+./transducer_stateless/decode.py \
+  --epoch $epoch \
+  --avg $avg \
+  --exp-dir transducer_stateless/exp-full \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  --max-duration 100 \
+  --decoding-method beam_search \
+  --beam-size 4
+```
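Here `--epoch 20 --avg 10` means decoding uses an element-wise average of the model weights saved for the last 10 epochs (11 through 20), not the epoch-20 checkpoint alone. A rough sketch of that averaging step is shown below; the `epoch-N.pt` file names and the `"model"` key are assumptions for illustration only, and icefall provides its own helper for this in practice:

```python
# Rough sketch of checkpoint averaging as implied by `--epoch 20 --avg 10`:
# element-wise average of the weights saved at epochs 11..20.
import torch


def average_checkpoints(filenames):
    # Assumes each file holds a dict with a "model" state_dict entry.
    avg = torch.load(filenames[0], map_location="cpu")["model"]
    for name in filenames[1:]:
        state = torch.load(name, map_location="cpu")["model"]
        for k in avg:
            avg[k] = avg[k] + state[k]
    n = len(filenames)
    for k in avg:
        if avg[k].is_floating_point():
            avg[k] = avg[k] / n
        else:
            avg[k] = avg[k] // n  # integer buffers (e.g. counters)
    return avg


epoch, num_avg = 20, 10
filenames = [
    f"transducer_stateless/exp-full/epoch-{i}.pt"
    for i in range(epoch - num_avg + 1, epoch + 1)
]
# model.load_state_dict(average_checkpoints(filenames))
```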
61+
462

563
#### 2021-12-17
64+
Using commit `cb04c8a7509425ab45fae888b0ca71bbbd23f0de`.
665

7-
RNN-T + Conformer encoder
66+
Conformer encoder + LSTM decoder.
867

968
The best WER is
1069

1170
| | test-clean | test-other |
1271
|-----|------------|------------|
1372
| WER | 3.16 | 7.71 |
1473

15-
using `--epoch 26 --avg 12` during decoding with greedy search.
74+
using `--epoch 26 --avg 12` with **greedy search**.
1675

1776
The training command to reproduce the above WER is:
1877

egs/librispeech/ASR/transducer/model.py

+5 -5

@@ -27,11 +27,6 @@

from icefall.utils import add_sos

-assert hasattr(torchaudio.functional, "rnnt_loss"), (
-    f"Current torchaudio version: {torchaudio.__version__}\n"
-    "Please install a version >= 0.10.0"
-)
-

class Transducer(nn.Module):
    """It implements https://arxiv.org/pdf/1211.3711.pdf
@@ -115,6 +110,11 @@ def forward(
        # Note: y does not start with SOS
        y_padded = y.pad(mode="constant", padding_value=0)

+        assert hasattr(torchaudio.functional, "rnnt_loss"), (
+            f"Current torchaudio version: {torchaudio.__version__}\n"
+            "Please install a version >= 0.10.0"
+        )
+
        loss = torchaudio.functional.rnnt_loss(
            logits=logits,
            targets=y_padded,

egs/librispeech/ASR/transducer_lstm/model.py

+5 -5

@@ -27,11 +27,6 @@

from icefall.utils import add_sos

-assert hasattr(torchaudio.functional, "rnnt_loss"), (
-    f"Current torchaudio version: {torchaudio.__version__}\n"
-    "Please install a version >= 0.10.0"
-)
-

class Transducer(nn.Module):
    """It implements https://arxiv.org/pdf/1211.3711.pdf
@@ -115,6 +110,11 @@ def forward(
        # Note: y does not start with SOS
        y_padded = y.pad(mode="constant", padding_value=0)

+        assert hasattr(torchaudio.functional, "rnnt_loss"), (
+            f"Current torchaudio version: {torchaudio.__version__}\n"
+            "Please install a version >= 0.10.0"
+        )
+
        loss = torchaudio.functional.rnnt_loss(
            logits=logits,
            targets=y_padded,
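Both `model.py` changes are identical: the torchaudio version assert is moved from import time into `forward()`, so it only fires when the loss is actually computed. For reference, a standalone toy call to `torchaudio.functional.rnnt_loss` (with arbitrary shapes chosen purely for illustration, not taken from the recipes) looks like this:

```python
# Toy call to torchaudio.functional.rnnt_loss (requires torchaudio >= 0.10.0).
# Expected layout:
#   logits:  (N, T, U + 1, C)  joint-network output
#   targets: (N, U)            int32 label IDs (no blank, no SOS)
import torch
import torchaudio

N, T, U, C = 2, 50, 10, 500  # batch, frames, target length, vocab size

logits = torch.randn(N, T, U + 1, C)
targets = torch.randint(1, C, (N, U), dtype=torch.int32)
logit_lengths = torch.full((N,), T, dtype=torch.int32)
target_lengths = torch.full((N,), U, dtype=torch.int32)

loss = torchaudio.functional.rnnt_loss(
    logits=logits,
    targets=targets,
    logit_lengths=logit_lengths,
    target_lengths=target_lengths,
    blank=0,
    reduction="mean",
)
print(loss)
```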
