Commit 0b19aa0

Compute features of librispeech and musan.
1 parent 40eed74 commit 0b19aa0

8 files changed: +322 −7 lines changed

.pre-commit-config.yaml

+3

@@ -3,16 +3,19 @@ repos:
     rev: 21.6b0
     hooks:
       - id: black
+        args: [--line-length=80]
 
   - repo: https://github.com/PyCQA/flake8
     rev: 3.9.2
     hooks:
       - id: flake8
+        args: [--max-line-length=80]
 
   - repo: https://github.com/pycqa/isort
     rev: 5.9.2
     hooks:
       - id: isort
+        args: [--profile=black]
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.0.1
egs/librispeech/ASR/local/compute_fbank_librispeech.py

+98 (new file)

#!/usr/bin/env python3

"""
This file computes fbank features of the librispeech dataset.
It looks for manifests in the directory data/manifests
and the generated fbank features are saved in data/fbank.
"""

import os
import subprocess
from contextlib import contextmanager
from pathlib import Path

from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse.recipes.utils import read_manifests_if_cached


@contextmanager
def get_executor():
    # We'll either return a process pool or a distributed worker pool.
    # Note that this has to be a context manager because we might use multiple
    # context managers ("with" clauses) inside, and this way everything will
    # free up the resources at the right time.
    try:
        # If this is executed on the CLSP grid, we will try to use the
        # Grid Engine to distribute the tasks.
        # Other clusters can also benefit from this, provided a
        # cluster-specific wrapper
        # (see https://github.com/pzelasko/plz for reference).
        #
        # The following must be installed:
        # $ pip install dask distributed
        # $ pip install git+https://github.com/pzelasko/plz
        name = subprocess.check_output("hostname -f", shell=True, text=True)
        if name.strip().endswith(".clsp.jhu.edu"):
            import plz
            from distributed import Client

            with plz.setup_cluster() as cluster:
                cluster.scale(80)
                yield Client(cluster)
            return
    except Exception:
        pass
    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None


def compute_fbank_librispeech():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    num_jobs = min(15, os.cpu_count())
    num_mel_bins = 80

    dataset_parts = (
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            if (output_dir / f"cuts_{partition}.json.gz").is_file():
                print(f"{partition} already exists - skipping.")
                continue
            print("Processing", partition)
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition:
                # Augment training partitions with 0.9x and 1.1x speed
                # perturbation; this triples the amount of training data.
                cut_set = (
                    cut_set
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")


if __name__ == "__main__":
    compute_fbank_librispeech()
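
After this script has run, each cuts_{partition}.json.gz manifest references the stored features. A minimal usage sketch (not part of the diff), assuming lhotse's CutSet API and the dev-clean output path written above:

# Sketch: inspect features produced by compute_fbank_librispeech.py.
# Assumes data/fbank/cuts_dev-clean.json.gz exists (written by the script).
from lhotse import CutSet

cuts = CutSet.from_json("data/fbank/cuts_dev-clean.json.gz")
cut = next(iter(cuts))
feats = cut.load_features()  # numpy array of shape (num_frames, num_mel_bins)
print(cut.id, cut.duration, feats.shape)  # 80 mel bins per frame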
egs/librispeech/ASR/local/compute_fbank_musan.py

+97 (new file)

#!/usr/bin/env python3

"""
This file computes fbank features of the musan dataset.
It looks for manifests in the directory data/manifests
and the generated fbank features are saved in data/fbank.
"""

import os
import subprocess
from contextlib import contextmanager
from pathlib import Path

from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer, combine
from lhotse.recipes.utils import read_manifests_if_cached


@contextmanager
def get_executor():
    # We'll either return a process pool or a distributed worker pool.
    # Note that this has to be a context manager because we might use multiple
    # context managers ("with" clauses) inside, and this way everything will
    # free up the resources at the right time.
    try:
        # If this is executed on the CLSP grid, we will try to use the
        # Grid Engine to distribute the tasks.
        # Other clusters can also benefit from this, provided a
        # cluster-specific wrapper
        # (see https://github.com/pzelasko/plz for reference).
        #
        # The following must be installed:
        # $ pip install dask distributed
        # $ pip install git+https://github.com/pzelasko/plz
        name = subprocess.check_output("hostname -f", shell=True, text=True)
        if name.strip().endswith(".clsp.jhu.edu"):
            import plz
            from distributed import Client

            with plz.setup_cluster() as cluster:
                cluster.scale(80)
                yield Client(cluster)
            return
    except Exception:
        pass
    # No need to return anything - compute_and_store_features
    # will just instantiate the pool itself.
    yield None


def compute_fbank_musan():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    num_jobs = min(15, os.cpu_count())
    num_mel_bins = 80

    dataset_parts = (
        "music",
        "speech",
        "noise",
    )
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    musan_cuts_path = output_dir / "cuts_musan.json.gz"

    if musan_cuts_path.is_file():
        print(f"{musan_cuts_path} already exists - skipping")
        return

    print("Extracting features for Musan")

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        # Create chunks of Musan with durations of 5 to 10 seconds:
        # cut into 10-second windows, then drop anything shorter than
        # 5 seconds (the leftover tails of recordings).
        musan_cuts = (
            CutSet.from_manifests(
                recordings=combine(
                    part["recordings"] for part in manifests.values()
                )
            )
            .cut_into_windows(10.0)
            .filter(lambda c: c.duration > 5)
            .compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_musan",
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
        )
        musan_cuts.to_json(musan_cuts_path)


if __name__ == "__main__":
    compute_fbank_musan()
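
The cuts_musan.json.gz manifest is typically consumed as a noise source for training-time augmentation. A minimal sketch (not part of the diff) using lhotse's CutMix transform; the prob and snr values here are illustrative assumptions, not taken from this commit:

# Sketch: use the MUSAN cuts for on-the-fly noise augmentation.
# prob/snr below are illustrative; this commit does not specify them.
from lhotse import CutSet
from lhotse.dataset import CutMix

musan_cuts = CutSet.from_json("data/fbank/cuts_musan.json.gz")
mix_noise = CutMix(cuts=musan_cuts, prob=0.5, snr=(10, 20))

train_cuts = CutSet.from_json("data/fbank/cuts_train-clean-100.json.gz")
augmented_cuts = mix_noise(train_cuts)  # mixes noise cuts into training cuts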
egs/librispeech/ASR/local/download_data.py

+21 (new file)

#!/usr/bin/env python3

"""
This file downloads the librispeech dataset
to the directory data/LibriSpeech.

It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh.
"""

from lhotse.recipes import download_librispeech


def download_data():
    target_dir = "data"

    download_librispeech(target_dir=target_dir, dataset_parts="librispeech")


if __name__ == "__main__":
    download_data()

egs/librispeech/ASR/local/download_lm.py

+4 −3

@@ -1,6 +1,9 @@
 #!/usr/bin/env python3
 
 # Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
+"""
+This file downloads librispeech LM files to data/lm.
+"""
 
 import gzip
 import os
@@ -26,9 +29,7 @@ def download_lm():
         filename = target_dir / f
         if filename.is_file() is False:
             urlretrieve_progress(
-                f"{url}/{f}",
-                filename=filename,
-                desc=f"Downloading {filename}",
+                f"{url}/{f}", filename=filename, desc=f"Downloading {filename}",
             )
 
         if ".gz" in str(filename):
egs/librispeech/ASR/local/prepare_librispeech_manifest.py

+29 (new file)

#!/usr/bin/env python3

"""
This file generates manifests for the librispeech dataset.
It expects the dataset to be saved in data/LibriSpeech
and the generated manifests are saved in data/manifests.
"""

import os
from pathlib import Path

from lhotse.recipes import prepare_librispeech


def prepare_librispeech_manifest():
    corpus_dir = Path("data/LibriSpeech")
    output_dir = Path("data/manifests")
    num_jobs = min(15, os.cpu_count())

    prepare_librispeech(
        corpus_dir=corpus_dir,
        dataset_parts="auto",
        output_dir=output_dir,
        num_jobs=num_jobs,
    )


if __name__ == "__main__":
    prepare_librispeech_manifest()
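
prepare_librispeech writes per-partition recording and supervision manifests into data/manifests. A minimal sketch (not part of the diff) for inspecting one of them; the recordings_dev-clean.json filename follows lhotse's usual naming convention and is an assumption here, as the exact names depend on the lhotse version:

# Sketch: inspect a generated manifest. The filename convention
# (recordings_<part>.json) is assumed, not guaranteed by this commit.
from lhotse import RecordingSet

recordings = RecordingSet.from_json("data/manifests/recordings_dev-clean.json")
print(len(recordings))  # number of recordings in dev-clean
rec = next(iter(recordings))
print(rec.id, rec.sampling_rate, rec.duration)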
egs/librispeech/ASR/local/prepare_musan_manifest.py

+22 (new file)

#!/usr/bin/env python3

"""
This file generates manifests for the musan dataset.
It expects the dataset to be saved in data/musan
and the generated manifests are saved in data/manifests.
"""

from pathlib import Path

from lhotse.recipes import prepare_musan


def prepare_musan_manifest():
    corpus_dir = Path("data/musan")
    output_dir = Path("data/manifests")

    prepare_musan(corpus_dir=corpus_dir, output_dir=output_dir)


if __name__ == "__main__":
    prepare_musan_manifest()

egs/librispeech/ASR/prepare.sh

+48 −4

@@ -1,6 +1,5 @@
 #!/usr/bin/env bash
 
-
 set -eou pipefail
 
 stage=-1
@@ -19,8 +18,53 @@ fi
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   echo "stage 0: Download data"
 
-  # If you have pre-downloaded it in /path/to/LibriSpeech
-  # Just run: ln -sfv /path/to/LibriSpeech data/
+  # If you have pre-downloaded it to /path/to/LibriSpeech,
+  # you can create a symlink to avoid downloading it again:
+  #
+  #   ln -sfv /path/to/LibriSpeech data/
+  #
+
   mkdir -p data/LibriSpeech
-  # TODO
+
+  if [ ! -f data/LibriSpeech/train-other-500/.completed ]; then
+    # It's compatible with kaldi's egs/librispeech/s5/local/download_and_untar.sh
+    ./local/download_data.py
+  fi
+
+  # If you have pre-downloaded it to /path/to/musan,
+  # you can create a symlink to avoid downloading it again:
+  #
+  #   ln -s /path/to/musan data/
+  #
+  if [ ! -e data/musan ]; then
+    wget https://www.openslr.org/resources/17/musan.tar.gz
+  fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  echo "Stage 1: Prepare librispeech manifest"
+  # We assume that you have downloaded the librispeech corpus
+  # to data/LibriSpeech
+  mkdir -p data/manifests
+  ./local/prepare_librispeech_manifest.py
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  echo "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to data/musan
+  mkdir -p data/manifests
+  ./local/prepare_musan_manifest.py
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  echo "Stage 3: Compute fbank for librispeech"
+  mkdir -p data/fbank
+  ./local/compute_fbank_librispeech.py
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  echo "Stage 4: Compute fbank for musan"
+  mkdir -p data/fbank
+  ./local/compute_fbank_musan.py
 fi
