From 595297229405fa74ec0dd53e0e7d0ce051802148 Mon Sep 17 00:00:00 2001
From: Xiaoyu Yang <45973641+marcoyang1998@users.noreply.github.com>
Date: Sat, 17 Aug 2024 13:24:38 +0800
Subject: [PATCH] Keep the custom fields in libriheavy manifest (#1719)

local/prepare_manifest.py now takes a third positional argument,
KEEP_CUSTOM_FIELDS. When it is true, the per-supervision "custom" entry
and the cut-level "custom" entry are left in the output manifest instead
of being deleted; the cleaned-up supervision text is written either way.

prepare.sh defines keep_custom_fields (default: False) before sourcing
shared/parse_options.sh and forwards its value to the script in stage 3.

The argument is recognised as true only when it equals "true", "1" or
"yes" (case-insensitive, surrounding whitespace ignored); every other
value, including "False", disables keeping the custom fields.
---
Revision note: the flag was previously parsed with bool(sys.argv[3]),
which is True for every non-empty string, so the default "False" passed
by prepare.sh still kept the custom fields. The parsing line below was
changed to an explicit string comparison. The "index" blob ids are those
of the original commit. Context-line whitespace was reconstructed; use
`git apply --ignore-whitespace` if the context does not match exactly.

 egs/libriheavy/ASR/local/prepare_manifest.py | 12 +++++++++---
 egs/libriheavy/ASR/prepare.sh                |  7 ++++++-
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/egs/libriheavy/ASR/local/prepare_manifest.py b/egs/libriheavy/ASR/local/prepare_manifest.py
index 42f392cae4..d7e184d863 100755
--- a/egs/libriheavy/ASR/local/prepare_manifest.py
+++ b/egs/libriheavy/ASR/local/prepare_manifest.py
@@ -29,17 +29,23 @@ def simple_cleanup(text: str) -> str:
 
 # Assign text of the supervisions and remove unnecessary entries.
 def main():
-    assert len(sys.argv) == 3, "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR"
+    assert (
+        len(sys.argv) == 4
+    ), "Usage: ./local/prepare_manifest.py INPUT OUTPUT_DIR KEEP_CUSTOM_FIELDS"
     fname = Path(sys.argv[1]).name
     oname = Path(sys.argv[2]) / fname
+    # bool() of any non-empty string (including "False") is True, so the
+    # flag text is compared explicitly instead of being passed to bool().
+    keep_custom_fields = sys.argv[3].strip().lower() in ("true", "1", "yes")
     with gzip.open(sys.argv[1], "r") as fin, gzip.open(oname, "w") as fout:
         for line in fin:
             cut = json.loads(line)
             cut["supervisions"][0]["text"] = simple_cleanup(
                 cut["supervisions"][0]["custom"]["texts"][0]
             )
-            del cut["supervisions"][0]["custom"]
-            del cut["custom"]
+            if not keep_custom_fields:
+                del cut["supervisions"][0]["custom"]
+                del cut["custom"]
             fout.write((json.dumps(cut) + "\n").encode())
 
 
diff --git a/egs/libriheavy/ASR/prepare.sh b/egs/libriheavy/ASR/prepare.sh
index b0736c98ba..366a1459f4 100755
--- a/egs/libriheavy/ASR/prepare.sh
+++ b/egs/libriheavy/ASR/prepare.sh
@@ -29,6 +29,11 @@ export CUDA_VISIBLE_DEVICES=""
 # - speech
 dl_dir=$PWD/download
 
+# If you want to do PromptASR experiments, please set it to True
+# as this will keep the texts and pre_text information required for
+# the training of PromptASR.
+keep_custom_fields=False
+
 . shared/parse_options.sh || exit 1
 
 # vocab size for sentence piece models.
@@ -134,7 +139,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   for subset in small medium large dev test_clean test_other; do
     if [ ! -e $manifests_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
       log "Prepare manifest for subset : ${subset}"
-      ./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir
+      ./local/prepare_manifest.py $dl_dir/libriheavy/libriheavy_cuts_${subset}.jsonl.gz $manifests_dir $keep_custom_fields
     fi
   done
 fi