Add recipe for the yes_no dataset. #16
Merged
Commits (11 total; the diff below shows changes from 2 of them):
- f246f0c Add recipe for the yes_no dataset. (csukuangfj)
- 09587d1 Refactoring: Remove unused code. (csukuangfj)
- 88166c5 Add Colab notebook for the yesno dataset. (csukuangfj)
- f65525d Add GitHub actions to run yesno. (csukuangfj)
- 1bdfcb6 Fix a typo. (csukuangfj)
- 3ffcd95 Minor fixes. (csukuangfj)
- 6617d58 Train more epochs for GitHub actions. (csukuangfj)
- 22dc936 Minor fixes. (csukuangfj)
- 7edc0c6 Minor fixes. (csukuangfj)
- b06f4cb Merge remote-tracking branch 'dan/master' into yesno (csukuangfj)
- c6e3e10 Fix style issues. (csukuangfj)
@@ -0,0 +1,134 @@
#!/usr/bin/env python3

"""
This script takes as input lang_dir and generates HLG from

    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
    - L, the lexicon, built from lang_dir/L_disambig.pt

      Caution: We use a lexicon that contains disambiguation symbols

    - G, the LM, built from data/lm/G.fst.txt

The generated HLG is saved in $lang_dir/HLG.pt
"""
import argparse
import logging
from pathlib import Path

import k2
import torch

from icefall.lexicon import Lexicon


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        help="""Input and output directory.
        """,
    )

    return parser.parse_args()


def compile_HLG(lang_dir: str) -> k2.Fsa:
    """
    Args:
      lang_dir:
        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.

    Return:
      An FSA representing HLG.
    """
    lexicon = Lexicon(lang_dir)
    max_token_id = max(lexicon.tokens)
    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
    H = k2.ctc_topo(max_token_id)
    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))

    logging.info("Loading G.fst.txt")
    with open("data/lm/G.fst.txt") as f:
        G = k2.Fsa.from_openfst(f.read(), acceptor=False)

    first_token_disambig_id = lexicon.token_table["#0"]
    first_word_disambig_id = lexicon.word_table["#0"]

    L = k2.arc_sort(L)
    G = k2.arc_sort(G)

    logging.info("Intersecting L and G")
    LG = k2.compose(L, G)
    logging.info(f"LG shape: {LG.shape}")

    logging.info("Connecting LG")
    LG = k2.connect(LG)
    logging.info(f"LG shape after k2.connect: {LG.shape}")

    logging.info(type(LG.aux_labels))
    logging.info("Determinizing LG")

    LG = k2.determinize(LG)
    logging.info(type(LG.aux_labels))

    logging.info("Connecting LG after k2.determinize")
    LG = k2.connect(LG)

    logging.info("Removing disambiguation symbols on LG")

    LG.labels[LG.labels >= first_token_disambig_id] = 0

    assert isinstance(LG.aux_labels, k2.RaggedInt)
    LG.aux_labels.values()[LG.aux_labels.values() >= first_word_disambig_id] = 0

    LG = k2.remove_epsilon(LG)
    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")

    LG = k2.connect(LG)
    LG.aux_labels = k2.ragged.remove_values_eq(LG.aux_labels, 0)

    logging.info("Arc sorting LG")
    LG = k2.arc_sort(LG)

    logging.info("Composing H and LG")
    # CAUTION: The name of the inner_labels is fixed
    # to `tokens`. If you want to change it, please
    # also change other places in icefall that are using
    # it.
    HLG = k2.compose(H, LG, inner_labels="tokens")

    logging.info("Connecting HLG")
    HLG = k2.connect(HLG)

    logging.info("Arc sorting HLG")
    HLG = k2.arc_sort(HLG)
    logging.info(f"HLG.shape: {HLG.shape}")

    return HLG


def main():
    args = get_args()
    lang_dir = Path(args.lang_dir)

    if (lang_dir / "HLG.pt").is_file():
        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
        return

    logging.info(f"Processing {lang_dir}")

    HLG = compile_HLG(lang_dir)
    logging.info(f"Saving HLG.pt to {lang_dir}")
    torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)

    main()
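
For reference, a minimal sketch (not part of this PR) of how the saved graph can be read back. It mirrors the k2.Fsa.from_dict / torch.load pattern the script itself uses for L_disambig.pt; data/lang_phone is just the example directory from the docstring:

import k2
import torch

# Load the HLG compiled above. The script saves it with
# torch.save(HLG.as_dict(), ...), so from_dict() restores it.
# "data/lang_phone" is the example lang_dir from the docstring.
HLG = k2.Fsa.from_dict(torch.load("data/lang_phone/HLG.pt"))
print(HLG.shape)  # (num_states, None) for a single FSA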
@@ -0,0 +1,80 @@
#!/usr/bin/env python3

"""
This file computes fbank features of the yesno dataset.
Its looks for manifests in the directory data/manifests.

[Review comment: Its -> It ?]

The generated fbank features are saved in data/fbank.
"""

import logging
import os
from pathlib import Path

import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse.recipes.utils import read_manifests_if_cached

from icefall.utils import get_executor

# Torch's multithreaded behavior needs to be disabled or it wastes a
# lot of CPU and slows things down. Do this outside of main() in case
# it needs to take effect even when we are not invoking the main
# (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def compute_fbank_yesno():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")

    # This dataset is rather small, so we use only one job.
    num_jobs = min(1, os.cpu_count())
    num_mel_bins = 23

    dataset_parts = (
        "train",
        "test",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            if (output_dir / f"cuts_{partition}.json.gz").is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition:
                cut_set = (
                    cut_set
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # This dataset is tiny, so a single job is enough
                # whether or not an executor is available.
                num_jobs=num_jobs if ex is None else 1,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )

    logging.basicConfig(format=formatter, level=logging.INFO)

    compute_fbank_yesno()
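
As a rough usage sketch (not in the PR): the cuts written above can be read back with lhotse's CutSet.from_json. The path follows the cuts_{partition} naming used in the script and assumes the script has already been run:

from lhotse import CutSet

# Read the manifest written by compute_fbank_yesno() for the train split.
cuts = CutSet.from_json("data/fbank/cuts_train.json.gz")

# Each cut can load its stored fbank features; with num_mel_bins=23
# above, the array has shape (num_frames, 23).
feats = next(iter(cuts)).load_features()
print(feats.shape)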
Review comment: Do we need to make these k2 operations run on GPU if there are devices available?
Reply:
For the yesno dataset, the graphs are tiny. It's ok to run them on CPU.
For the librispeech dataset, I think it's worthwhile to have some benchmarks. If GPU is faster, we can switch to it.
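
If such a benchmark is attempted, a minimal sketch of the starting point, assuming a CUDA-enabled k2 build: k2.Fsa supports .to(device), but whether every construction op (e.g. determinization) runs on GPU would need to be checked against the k2 version in use.

import k2
import torch

# Use a GPU when one is available; otherwise stay on CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Graphs can be moved between devices with Fsa.to(); for example,
# a compiled HLG could be placed on the GPU before decoding.
# "data/lang_phone" is the example lang_dir from the docstring above.
HLG = k2.Fsa.from_dict(torch.load("data/lang_phone/HLG.pt"))
HLG = HLG.to(device)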