from typing import List, Optional, Tuple


class NgramLm:
    """An n-gram LM stored as an FST whose arcs are sorted by input label.

    A label is looked up not only from a given state but also from every
    state reachable from it through arcs labelled with ``backoff_id``.
    """

    def __init__(
        self,
        binary_fst_filename: str,
        backoff_id: int,
    ):
        """
        Args:
          binary_fst_filename:
            Path to the binary FST.
          backoff_id:
            ID of the backoff symbol.
        """
        # Function-scope import keeps this block self-contained.
        import kaldifst

        lm = kaldifst.StdVectorFst.read(binary_fst_filename)
        # The arc lookup below is a binary search over ilabels, so the arcs
        # have to be sorted by ilabel.
        if not lm.is_ilabel_sorted:
            kaldifst.arcsort(lm, sort_type="ilabel")

        self.lm = lm
        self.backoff_id = backoff_id

    def _process_backoff_arcs(
        self,
        state: int,
        cost: float,
    ) -> List[Tuple[int, float]]:
        """Similar to ProcessNonemitting() from Kaldi, this function
        returns the list of states reachable from the given state via
        backoff arcs.

        Args:
          state:
            The input state.
          cost:
            The cost of reaching the given state from the start state.
        Returns:
          Return a list, where each element contains a tuple with two entries:
            - next_state
            - cost of next_state
          The entries are ordered along the backoff chain. If there is no
          backoff arc leaving the input state, then return an empty list.
        """
        lookup = self._get_next_state_and_cost_without_backoff

        ans = []
        # States already on the chain. Without this guard a cycle of backoff
        # arcs would never terminate.
        visited = {state}
        while True:
            next_state, next_cost = lookup(
                state=state,
                label=self.backoff_id,
            )
            if next_state is None or next_state in visited:
                return ans

            cost = next_cost + cost
            ans.append((next_state, cost))
            visited.add(next_state)
            state = next_state

    def _get_next_state_and_cost_without_backoff(
        self, state: int, label: int
    ) -> Tuple[Optional[int], Optional[float]]:
        """Find the arc leaving ``state`` whose input label is ``label``.

        Only arcs leaving ``state`` itself are examined; backoff arcs are
        not followed.

        Args:
          state:
            The state whose outgoing arcs are searched.
          label:
            The input label to look for.
        Returns:
          Return a tuple (next_state, cost) taken from the matching arc,
          where cost is the value of the arc weight. Return (None, None)
          if no arc leaving ``state`` has the given input label.
        """
        # Function-scope import keeps this block self-contained.
        import kaldifst

        arc_iter = kaldifst.ArcIterator(self.lm, state)
        num_arcs = self.lm.num_arcs(state)

        # The LM is arc sorted by ilabel, so we use binary search below.
        left = 0
        right = num_arcs - 1
        while left <= right:
            mid = (left + right) // 2
            arc_iter.seek(mid)
            arc = arc_iter.value
            if arc.ilabel < label:
                left = mid + 1
            elif arc.ilabel > label:
                right = mid - 1
            else:
                return arc.nextstate, arc.weight.value

        return None, None

    def get_next_state_and_cost(
        self,
        state: int,
        label: int,
    ) -> Tuple[List[int], List[float]]:
        """Return every state reachable by consuming ``label`` from ``state``.

        The label is tried from ``state`` itself and from each state on its
        backoff chain.

        Args:
          state:
            The state to start from.
          label:
            The input label to consume.
        Returns:
          Return a tuple (next_states, next_costs) of two parallel lists.
          Each cost is the accumulated cost of the backoff arcs taken (0 for
          ``state`` itself) plus the cost of the arc consuming ``label``.
          Both lists are empty if the label cannot be consumed.
        """
        # (source state, cost of reaching it from `state` via backoff arcs)
        candidates = [(state, 0)]
        candidates += self._process_backoff_arcs(state=state, cost=0)

        next_states = []
        next_costs = []
        for s, c in candidates:
            ns, nc = self._get_next_state_and_cost_without_backoff(s, label)
            # Compare with None explicitly: state 0 is a valid destination
            # but is falsy.
            if ns is not None:
                next_states.append(ns)
                next_costs.append(c + nc)

        return next_states, next_costs
def modified_beam_search2(
    model: Transducer,
    encoder_out: torch.Tensor,
    beam: int = 4,
):
    """Beam search over one utterance that extends every hypothesis by at
    most one token per frame.

    Args:
      model:
        The transducer model. Its decoder and joiner are used.
      encoder_out:
        The encoder output of a single utterance. It is passed through
        `model.joiner.encoder_proj` first and must be 2-D afterwards; each
        row is decoded as one frame.
      beam:
        The maximum number of candidates kept after each frame.
    Returns:
      Return the token IDs of the hypothesis selected by
      `get_most_probable(length_norm=True)`, with the leading
      `context_size` blank IDs removed.
    """
    encoder_out = model.joiner.encoder_proj(encoder_out)
    assert encoder_out.ndim == 2, encoder_out.shape

    blank_id = model.decoder.blank_id
    # Fall back to blank_id when the model does not define unk_id.
    unk_id = getattr(model, "unk_id", blank_id)
    context_size = model.decoder.context_size
    device = next(model.parameters()).device

    # Start from a single hypothesis made of blanks only.
    B = HypothesisList()
    B.add(
        Hypothesis(
            ys=[blank_id] * context_size,
            log_prob=torch.zeros(1, dtype=torch.float32, device=device),
        )
    )

    num_frames = encoder_out.shape[0]
    for t in range(num_frames):
        A = list(B)
        B = HypothesisList()
        num_hyps = len(A)

        ys_log_probs = torch.cat(
            [hyp.log_prob.reshape(1, 1) for hyp in A]
        )  # (num_hyps, 1)

        decoder_input = torch.tensor(
            [hyp.ys[-context_size:] for hyp in A],
            device=device,
            dtype=torch.int64,
        )  # (num_hyps, context_size)
        decoder_out = model.decoder(decoder_input, need_pad=False).squeeze(1)
        decoder_out = model.joiner.decoder_proj(decoder_out)
        # decoder_out is of shape (num_hyps, joiner_dim)

        current_encoder_out = encoder_out[t : t + 1].repeat(num_hyps, 1)
        # current_encoder_out is of shape (num_hyps, encoder_out_dim)

        logits = model.joiner(
            current_encoder_out,
            decoder_out,
            project_input=False,
        )  # (num_hyps, vocab_size)
        log_probs = logits.log_softmax(dim=-1)  # (num_hyps, vocab_size)
        # Add the score of each hypothesis to all of its continuations.
        log_probs.add_(ys_log_probs)

        vocab_size = log_probs.size(-1)
        log_probs = log_probs.reshape(-1)

        # topk() raises if asked for more entries than there are, so never
        # request more than the number of (hypothesis, token) candidates.
        num_kept = min(beam, log_probs.numel())
        topk_log_probs, topk_indexes = log_probs.topk(num_kept)

        # A flat index encodes (hypothesis index, token index).
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
            topk_token_indexes = (topk_indexes % vocab_size).tolist()

        candidates = zip(topk_hyp_indexes, topk_token_indexes)
        for k, (hyp_idx, new_token) in enumerate(candidates):
            new_ys = A[hyp_idx].ys[:]
            # blank and unk do not extend the token sequence.
            if new_token not in (blank_id, unk_id):
                new_ys.append(new_token)

            B.add(Hypothesis(ys=new_ys, log_prob=topk_log_probs[k]))

    best_hyp = B.get_most_probable(length_norm=True)
    return best_hyp.ys[context_size:]
#!/usr/bin/env bash

# Build the bigram and trigram LMs for data/lang_bpe_500: an ARPA file made
# by ./shared/make_kn_lm.py and its text FST made by kaldilm.
# Files that already exist are not regenerated.

# Stop at the first failing command instead of carrying on with missing or
# incomplete files.
set -euo pipefail

lang_dir=data/lang_bpe_500

# Usage: make_lm <ngram-order> <basename>
# Creates $lang_dir/<basename>.arpa and $lang_dir/<basename>.fst.txt.
make_lm() {
  local order=$1
  local name=$2
  local arpa="$lang_dir/$name.arpa"
  local fst_txt="$lang_dir/$name.fst.txt"

  if [ ! -f "$arpa" ]; then
    ./shared/make_kn_lm.py \
      -ngram-order "$order" \
      -text "$lang_dir/transcript_tokens.txt" \
      -lm "$arpa"
  fi

  if [ ! -f "$fst_txt" ]; then
    # Redirect to a temporary file and rename it only on success. Otherwise
    # a failed run leaves a partial $fst_txt behind and the existence check
    # above skips it on the next run.
    python3 -m kaldilm \
      --read-symbol-table="$lang_dir/tokens.txt" \
      --disambig-symbol='#0' \
      --max-order="$order" \
      "$arpa" > "$fst_txt.tmp"
    mv "$fst_txt.tmp" "$fst_txt"
  fi
}

make_lm 2 bigram
make_lm 3 trigram
@@ -546,6 +563,8 @@ def decode_dataset( decoding_graph=decoding_graph, word_table=word_table, batch=batch, + ngram_lm=ngram_lm, + ngram_lm_scale=ngram_lm_scale, ) for name, hyps in hyps_dict.items(): @@ -631,6 +650,7 @@ def main(): "fast_beam_search_nbest_LG", "fast_beam_search_nbest_oracle", "modified_beam_search", + "modified_beam_search2", ) params.res_dir = params.exp_dir / params.decoding_method @@ -655,6 +675,7 @@ def main(): else: params.suffix += f"-context-{params.context_size}" params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}" + params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}" if params.use_averaged_model: params.suffix += "-use-averaged-model" @@ -768,6 +789,16 @@ def main(): model.to(device) model.eval() + # lm_filename = "bigram.fst.txt" + lm_filename = "trigram.fst.txt" + logging.info(f"lm filename: {lm_filename}") + ngram_lm = NgramLm( + str(params.lang_dir / lm_filename), + backoff_id=500, + is_binary=False, + ) + logging.info(f"num states: {ngram_lm.lm.num_states}") + if "fast_beam_search" in params.decoding_method: if params.decoding_method == "fast_beam_search_nbest_LG": lexicon = Lexicon(params.lang_dir) @@ -812,6 +843,8 @@ def main(): sp=sp, word_table=word_table, decoding_graph=decoding_graph, + ngram_lm=ngram_lm, + ngram_lm_scale=params.ngram_lm_scale, ) save_results( diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py index 9095e8d521..36f159cf7f 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py @@ -23,6 +23,7 @@ import torch from model import Transducer +from icefall import NgramLm, NgramLmStateCost from icefall.decode import Nbest, one_best_decoding from icefall.utils import add_eos, add_sos, get_texts @@ -656,6 +657,8 @@ class Hypothesis: # It contains only one entry. 
log_prob: torch.Tensor + state_cost: Optional[NgramLmStateCost] = None + @property def key(self) -> str: """Return a string representation of self.ys""" @@ -1544,12 +1547,14 @@ def fast_beam_search_with_nbest_rnn_rescoring( def modified_beam_search2( model: Transducer, encoder_out: torch.Tensor, + ngram_lm: NgramLm, + ngram_lm_scale: float, beam: int = 4, ): - """ """ - encoder_out = model.joiner.encoder_proj(encoder_out) + lm_scale = ngram_lm_scale + assert encoder_out.ndim == 2, encoder_out.shape blank_id = model.decoder.blank_id unk_id = getattr(model, "unk_id", blank_id) @@ -1561,6 +1566,7 @@ def modified_beam_search2( Hypothesis( ys=[blank_id] * context_size, log_prob=torch.zeros(1, dtype=torch.float32, device=device), + state_cost=NgramLmStateCost(ngram_lm), ) ) @@ -1571,7 +1577,10 @@ def modified_beam_search2( B = HypothesisList() ys_log_probs = torch.cat( - [hyp.log_prob.reshape(1, 1) for hyp in A] + [ + hyp.log_prob.reshape(1, 1) + hyp.state_cost.lm_score * lm_scale + for hyp in A + ] ) # (num_hyps, 1) decoder_input = torch.tensor( @@ -1610,9 +1619,18 @@ def modified_beam_search2( new_token = topk_token_indexes[k] if new_token not in (blank_id, unk_id): new_ys.append(new_token) + state_cost = hyp.state_cost.forward_one_step(new_token) + else: + state_cost = hyp.state_cost - new_log_prob = topk_log_probs[k] - new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob) + # We only keep AM scores in new_hyp.log_prob + new_log_prob = ( + topk_log_probs[k] - hyp.state_cost.lm_score * lm_scale + ) + + new_hyp = Hypothesis( + ys=new_ys, log_prob=new_log_prob, state_cost=state_cost + ) B.add(new_hyp) best_hyp = B.get_most_probable(length_norm=True) diff --git a/icefall/__init__.py b/icefall/__init__.py index 618f47e6d8..122226fdc9 100644 --- a/icefall/__init__.py +++ b/icefall/__init__.py @@ -66,4 +66,4 @@ write_error_stats, ) -from .ngram_lm import NgramLm +from .ngram_lm import NgramLm, NgramLmStateCost diff --git a/icefall/ngram_lm.py b/icefall/ngram_lm.py index 
from collections import defaultdict
from typing import Optional


class NgramLmStateCost:
    """The set of LM states a token sequence can be in, with the lowest
    known cost of reaching each of them.

    Instances are not modified by `forward_one_step`; it returns a new
    instance instead.
    """

    def __init__(self, ngram_lm: "NgramLm", state_cost: Optional[dict] = None):
        """
        Args:
          ngram_lm:
            The LM to walk. Its `lm.start` and `get_next_state_and_cost`
            are used.
          state_cost:
            A map from state to cost. It is stored as is, not copied.
            If None, only the start state of the LM is active, with cost 0.
        """
        start = ngram_lm.lm.start
        # OpenFst reports a missing start state as a negative id
        # (kNoStateId); there is nothing to walk in that case.
        if start < 0:
            raise ValueError(f"The LM has no start state: {start}")

        self.ngram_lm = ngram_lm
        if state_cost is None:
            state_cost = defaultdict(lambda: float("inf"))
            # At the very beginning, we are at the start state with cost 0
            state_cost[start] = 0.0
        self.state_cost = state_cost

    def forward_one_step(self, label: int) -> "NgramLmStateCost":
        """Consume one label from every active state.

        Args:
          label:
            The input label to consume.
        Returns:
          Return a new NgramLmStateCost holding the states reached and, for
          each of them, the lowest cost over all the ways of reaching it.
          It has no states if the label cannot be consumed from any active
          state.
        """
        best = defaultdict(lambda: float("inf"))
        for src, src_cost in self.state_cost.items():
            dst_states, arc_costs = self.ngram_lm.get_next_state_and_cost(
                src,
                label,
            )
            for dst, arc_cost in zip(dst_states, arc_costs):
                total = src_cost + arc_cost
                # Several active states may lead to dst; keep the cheapest.
                if total < best[dst]:
                    best[dst] = total

        return NgramLmStateCost(ngram_lm=self.ngram_lm, state_cost=best)

    @property
    def lm_score(self) -> float:
        """The negated lowest cost over the active states, or -inf if there
        is no active state."""
        if not self.state_cost:
            return float("-inf")

        return -min(self.state_cost.values())