-
Notifications
You must be signed in to change notification settings - Fork 295
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Yifan Yang
committed
Aug 7, 2023
1 parent
1ee251c
commit 1398e07
Showing
40 changed files
with
7,479 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Run zipformer greedy-search decoding for epoch 120, averaging the last
# `avg` checkpoints, for every avg in 17..21.
for avg in 17 18 19 20 21; do
  ./zipformer/decode.py \
    --epoch 120 \
    --avg $avg \
    --exp-dir ./zipformer/exp \
    --max-duration 2000 \
    --decoding-method greedy_search
done
150 changes: 150 additions & 0 deletions
150
egs/bengaliai_speech/ASR/local/compute_fbank_bengaliai_speech_splits.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
#!/usr/bin/env python3 | ||
# Copyright 2023 Xiaomi Corp. (Yifan Yang) | ||
# | ||
# See ../../../../LICENSE for clarification regarding multiple authors | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import argparse | ||
import logging | ||
from datetime import datetime | ||
from pathlib import Path | ||
|
||
import torch | ||
from lhotse import ( | ||
CutSet, | ||
KaldifeatFbank, | ||
KaldifeatFbankConfig, | ||
LilcomChunkyWriter, | ||
set_audio_duration_mismatch_tolerance, | ||
set_caching_enabled, | ||
) | ||
|
||
# Torch's multithreaded behavior needs to be disabled or | ||
# it wastes a lot of CPU and slow things down. | ||
# Do this outside of main() in case it needs to take effect | ||
# even when we are not invoking the main (e.g. when spawning subprocesses). | ||
torch.set_num_threads(1) | ||
torch.set_num_interop_threads(1) | ||
|
||
|
||
def get_args():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser()

    # (flag, add_argument keyword options) pairs, registered in order below.
    arg_specs = [
        (
            "--num-workers",
            dict(
                type=int,
                default=20,
                help="Number of dataloading workers used for reading the audio.",
            ),
        ),
        (
            "--batch-duration",
            dict(
                type=float,
                default=600.0,
                help="The maximum number of audio seconds in a batch."
                "Determines batch size dynamically.",
            ),
        ),
        (
            "--num-splits",
            dict(
                type=int,
                required=True,
                help="The number of splits of the train subset",
            ),
        ),
        (
            "--start",
            dict(
                type=int,
                default=0,
                help="Process pieces starting from this number (inclusive).",
            ),
        ),
        (
            "--stop",
            dict(
                type=int,
                default=-1,
                help="Stop processing pieces until this number (exclusive).",
            ),
        ),
    ]
    for flag, options in arg_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()
|
||
|
||
def compute_fbank_bengaliai_speech_splits(args):
    """Compute fbank features for pieces of the Bengali.AI Speech train split.

    For each split index in ``[args.start, stop)`` this loads the raw cut
    manifest ``bengaliai_speech_cuts_train_raw.<idx>.jsonl.gz`` from
    ``data/fbank/bengaliai_speech_train_split``, trims cuts to their
    supervisions, extracts fbank features (on GPU when available), and writes
    the resulting cut manifest next to the stored features. Splits whose
    output manifest already exists are skipped, so the job is resumable.

    Args:
      args: Parsed CLI namespace providing ``num_splits``, ``start``,
        ``stop``, ``num_workers`` and ``batch_duration``.
    """
    num_splits = args.num_splits
    # Fixed output location produced by the earlier splitting stage.
    output_dir = Path("data/fbank/bengaliai_speech_train_split")
    assert output_dir.exists(), f"{output_dir} does not exist!"

    # Split filenames use 1-based indices zero-padded to 8 digits.
    num_digits = 8

    start = args.start
    stop = args.stop
    if stop < start:
        # stop defaults to -1, meaning "process through the last split".
        stop = num_splits

    stop = min(stop, num_splits)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
    logging.info(f"device: {device}")

    set_audio_duration_mismatch_tolerance(0.01)  # 10ms tolerance
    set_caching_enabled(False)

    for i in range(start, stop):
        idx = f"{i + 1}".zfill(num_digits)
        logging.info(f"Processing train split: {idx}")

        cuts_path = output_dir / f"bengaliai_speech_cuts_train.{idx}.jsonl.gz"
        if cuts_path.is_file():
            logging.info(f"{cuts_path} exists - skipping")
            continue

        raw_cuts_path = output_dir / f"bengaliai_speech_cuts_train_raw.{idx}.jsonl.gz"

        logging.info(f"Loading {raw_cuts_path}")
        cut_set = CutSet.from_file(raw_cuts_path)

        logging.info("Splitting cuts into smaller chunks.")
        cut_set = cut_set.trim_to_supervisions(
            keep_overlapping=False, min_duration=None
        )

        logging.info("Computing features")
        cut_set = cut_set.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{output_dir}/bengaliai_speech_feats_train_{idx}",
            num_workers=args.num_workers,
            batch_duration=args.batch_duration,
            storage_type=LilcomChunkyWriter,
            overwrite=True,
        )

        logging.info(f"Saving to {cuts_path}")
        cut_set.to_file(cuts_path)
|
||
|
||
def main() -> None:
    """Entry point: configure logging, parse args, run feature extraction."""
    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)

    args = get_args()
    logging.info(vars(args))
    compute_fbank_bengaliai_speech_splits(args)
|
||
|
||
# Run the entry point only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
92 changes: 92 additions & 0 deletions
92
egs/bengaliai_speech/ASR/local/compute_fbank_bengaliai_speech_valid_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#!/usr/bin/env python3 | ||
# Copyright 2023 Xiaomi Corp. (authors: Yifan Yang) | ||
# | ||
# See ../../../../LICENSE for clarification regarding multiple authors | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
""" | ||
This file computes fbank features of the Bengali.AI Speech dataset. | ||
It looks for manifests in the directory data/manifests. | ||
The generated fbank features are saved in data/fbank. | ||
""" | ||
|
||
import argparse | ||
import logging | ||
import os | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
import torch | ||
from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig, LilcomChunkyWriter | ||
|
||
# Torch's multithreaded behavior needs to be disabled or | ||
# it wastes a lot of CPU and slow things down. | ||
# Do this outside of main() in case it needs to take effect | ||
# even when we are not invoking the main (e.g. when spawning subprocesses). | ||
torch.set_num_threads(1) | ||
torch.set_num_interop_threads(1) | ||
|
||
|
||
def compute_fbank_bengaliai_speech_valid_test():
    """Compute fbank features for the valid and test subsets.

    Reads the raw cut manifests
    ``bengaliai_speech_cuts_{valid,test}_raw.jsonl.gz`` from ``data/fbank``,
    trims cuts to their supervisions, extracts fbank features (on GPU when
    available), and writes the final cut manifests back to ``data/fbank``.
    Partitions whose output manifest already exists are skipped.
    """
    output_dir = Path("data/fbank")
    num_workers = 42
    batch_duration = 600  # max audio seconds per extraction batch

    subsets = ("valid", "test")

    # Prefer the first GPU for feature extraction when one is available.
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))

    logging.info(f"device: {device}")

    for partition in subsets:
        cuts_path = output_dir / f"bengaliai_speech_cuts_{partition}.jsonl.gz"
        if cuts_path.is_file():
            logging.info(f"{partition} already exists - skipping.")
            continue

        raw_cuts_path = output_dir / f"bengaliai_speech_cuts_{partition}_raw.jsonl.gz"

        logging.info(f"Loading {raw_cuts_path}")
        cut_set = CutSet.from_file(raw_cuts_path)

        logging.info("Splitting cuts into smaller chunks")
        cut_set = cut_set.trim_to_supervisions(
            keep_overlapping=False, min_duration=None
        )

        logging.info("Computing features")
        cut_set = cut_set.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{output_dir}/bengaliai_speech_feats_{partition}",
            num_workers=num_workers,
            batch_duration=batch_duration,
            storage_type=LilcomChunkyWriter,
            overwrite=True,
        )

        logging.info(f"Saving to {cuts_path}")
        cut_set.to_file(cuts_path)
|
||
|
||
if __name__ == "__main__":
    # Configure INFO-level logging with file/line context before running.
    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)
    compute_fbank_bengaliai_speech_valid_test()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../../librispeech/ASR/local/prepare_lang_bpe.py |
124 changes: 124 additions & 0 deletions
124
egs/bengaliai_speech/ASR/local/preprocess_bengaliai_speech.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#!/usr/bin/env python3 | ||
# Copyright 2023 Xiaomi Corp. (authors: Yifan Yang) | ||
# | ||
# See ../../../../LICENSE for clarification regarding multiple authors | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import argparse | ||
import logging | ||
import re | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
from lhotse import CutSet, SupervisionSegment | ||
from lhotse.recipes.utils import read_manifests_if_cached | ||
|
||
|
||
def get_args():
    """Build the command-line parser and return the parsed arguments."""
    dataset_help = """Dataset parts to compute fbank. If None, we will use all"""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=str, help=dataset_help)
    return parser.parse_args()
|
||
|
||
# Characters stripped from transcripts: arithmetic operators and punctuation
# (Western and Bengali danda). The hyphen is escaped (`\-`) so it is matched
# literally — in the original pattern the unescaped "-" between `"` and `[`
# formed an accidental character range (0x22-0x5B) that also stripped digits
# and ASCII uppercase letters, which are not operators or punctuation.
_OPR_AND_PUNC_RE = re.compile(r"[=+\-*/%<>×÷।,;:?!'.\"\[\]{}()–—―~]+")


def normalize_text(utt: str) -> str:
    """Remove operator/punctuation characters from *utt* and uppercase it.

    Uppercasing is a no-op for Bengali script but normalizes any Latin text
    that appears in the transcripts.
    """
    return _OPR_AND_PUNC_RE.sub("", utt).upper()
|
||
|
||
def preprocess_bengaliai_speech(
    dataset: Optional[str] = None,
):
    """Normalize transcripts and create raw cut manifests.

    Reads recording/supervision manifests from ``data/manifests``, normalizes
    the supervision texts via ``normalize_text``, resamples audio to 16 kHz,
    and writes raw cut manifests to ``data/fbank``. Partitions whose raw cut
    manifest already exists are skipped.

    Args:
      dataset: Space-separated subset names (e.g. ``"train valid"``).
        If None, all of "train", "valid" and "test" are processed.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    output_dir.mkdir(exist_ok=True)

    if dataset is None:
        dataset_parts = (
            "train",
            "valid",
            "test",
        )
    else:
        dataset_parts = dataset.split(" ", -1)

    logging.info("Loading manifest")
    prefix = "bengaliai_speech"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        suffix=suffix,
        prefix=prefix,
    )
    assert manifests is not None

    # Every requested part must have been found on disk.
    assert len(manifests) == len(dataset_parts), (
        len(manifests),
        len(dataset_parts),
        list(manifests.keys()),
        dataset_parts,
    )

    for partition, m in manifests.items():
        logging.info(f"Processing {partition}")
        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
        if raw_cuts_path.is_file():
            logging.info(f"{partition} already exists - skipping")
            continue

        logging.info(f"Normalizing text in {partition}")
        for sup in m["supervisions"]:
            if sup.text is None:
                continue
            orig_text = str(sup.text)
            sup.text = normalize_text(sup.text)
            text = str(sup.text)
            # Only log when normalization actually removed characters.
            if len(orig_text) != len(text):
                logging.info(
                    f"\nOriginal text vs normalized text:\n{orig_text}\n{text}"
                )

        # Create long-recording cut manifests; resample audio to 16 kHz.
        cut_set = CutSet.from_manifests(
            recordings=m["recordings"],
            supervisions=m["supervisions"],
        ).resample(16000)

        # Run data augmentation that needs to be done in the
        # time domain.
        logging.info(f"Saving to {raw_cuts_path}")
        cut_set.to_file(raw_cuts_path)
|
||
|
||
def main() -> None:
    """Entry point: configure logging, parse args, run preprocessing."""
    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)

    args = get_args()
    logging.info(vars(args))
    preprocess_bengaliai_speech(
        dataset=args.dataset,
    )
    logging.info("Done")
|
||
|
||
# Run the entry point only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../../librispeech/ASR/local/train_bpe_model.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../../../librispeech/ASR/local/validate_bpe_lexicon.py |
Oops, something went wrong.