diff --git a/tools/datasets/corpora.py b/tools/datasets/corpora.py index 35977b908..9056b8f97 100644 --- a/tools/datasets/corpora.py +++ b/tools/datasets/corpora.py @@ -141,7 +141,7 @@ def tokenize(self): [os.path.join(parent_folder, os.path.basename(url)) for url in self.urls] ) - cmd = f"python tools/preprocess_data.py \ + cmd = f"python tools/datasets/preprocess_data.py \ --input {jsonl_filepath} \ --output-prefix {parent_folder}/{self.name} \ --vocab {self.vocab_file} \