From 37c086e6a02eb032d9c39862be6024489bc74455 Mon Sep 17 00:00:00 2001 From: kouloumos Date: Tue, 5 Dec 2023 11:58:38 +0200 Subject: [PATCH] cli: replace `preprocess-sources` with `preprocess` preprocessing now also supports individual sources as input following the same pattern as the `transcribe` command --- Readme.md | 23 ++++++++----- app/transcription.py | 47 ++++++++++++++++++++------ transcriber.py | 80 ++++++++++++++++++++++++++++---------------- 3 files changed, 102 insertions(+), 48 deletions(-) diff --git a/Readme.md b/Readme.md index ce6ce69..b2ee34c 100644 --- a/Readme.md +++ b/Readme.md @@ -9,15 +9,20 @@ This cli app transcribes audio and video for submission to the [bitcointranscrip - summarization `--summarize` - diarization `--diarize` -**Features**: - -- Transcription using [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) -- Collection of video's metadata when sourcing from YouTube. -- Open Pull Request on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript. -- Save the resulting transcript to a markdown format supported by bitcointranscripts. -- Upload the resulting transcript to an AWS S3 Bucket -repo. -- Push the resulting transcript to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend), or save the payload in a json for later use. +**Transcription Workflow** + +This transcription tool operates through a structured four-stage process: + +1. Preprocess: Gathers all the available metadata for each source (supports YouTube videos&playlists, and RSS feeds) +2. Process: Downloads and converts sources for transcription preparation +3. Transcription: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts from MP3 files. +4. 
Postprocess: Offers multiple options for further actions: + - **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript. + - **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts. + - **Upload**: Saves transcripts in an AWS S3 Bucket. + - **Push to Queuer backend**: Sends transcripts to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend). + - **Save as JSON**: Preserves transcripts for future use. + ## Prerequisites diff --git a/app/transcription.py b/app/transcription.py index dcbc033..adf68d3 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -26,7 +26,27 @@ class Transcription: - def __init__(self, model="tiny", chapters=False, pr=False, summarize=False, deepgram=False, diarize=False, upload=False, model_output_dir="local_models/", nocleanup=False, queue=True, markdown=False, username=None, test_mode=False, working_dir=None): + def __init__( + self, + model="tiny", + chapters=False, + pr=False, + summarize=False, + deepgram=False, + diarize=False, + upload=False, + model_output_dir="local_models/", + nocleanup=False, + queue=True, + markdown=False, + username=None, + test_mode=False, + working_dir=None, + batch_preprocessing_output=False + ): + self.logger = get_logger() + self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp() + self.model = model self.transcript_by = "username" if test_mode else self.__get_username() self.generate_chapters = chapters @@ -44,8 +64,7 @@ def __init__(self, model="tiny", chapters=False, pr=False, summarize=False, deep self.markdown = markdown or test_mode self.existing_media = None self.test_mode = test_mode - self.logger = get_logger() - self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp() + self.preprocessing_output = [] if batch_preprocessing_output else None self.logger.info(f"Temp directory: {self.tmp_dir}") @@ 
-124,13 +143,18 @@ def check_if_youtube(source: Source): def _new_transcript_from_source(self, source: Source): """Helper method to initialize a new Transcript from source""" self.transcripts.append(Transcript(source, self.test_mode)) - # Save preprocessed source + if source.preprocess: - write_to_json( - source.to_json(), - f"{self.model_output_dir}/{source.loc}", - f"{source.title}_preprocess", is_metadata=True - ) + if self.preprocessing_output is None: + # Save preprocessing output for each individual source + write_to_json( + source.to_json(), + f"{self.model_output_dir}/{source.loc}", + f"{source.title}_preprocess", is_metadata=True + ) + else: + # Keep preprocessing outputs for later use + self.preprocessing_output.append(source.to_json()) def add_transcription_source(self, source_file, loc="misc", title=None, date=None, tags=[], category=[], speakers=[], preprocess=True, youtube_metadata=None, link=None, chapters=None, nocheck=False, excluded_media=[]): """Add a source for transcription""" @@ -185,7 +209,7 @@ def add_transcription_source(self, source_file, loc="misc", title=None, date=Non f"{source.title}: sources added for transcription: {len(transcription_sources['added'])} (Ignored: {len(transcription_sources['exist'])} sources)") return transcription_sources - def add_transcription_source_JSON(self, json_file): + def add_transcription_source_JSON(self, json_file, nocheck=False): # validation checks check_if_valid_file_path(json_file) sources = check_if_valid_json(json_file) @@ -210,7 +234,8 @@ def add_transcription_source_JSON(self, json_file): youtube_metadata=metadata["youtube_metadata"], chapters=metadata["chapters"], link=metadata["media"], - excluded_media=metadata["excluded_media"] + excluded_media=metadata["excluded_media"], + nocheck=nocheck ) def start(self, test_transcript=None): diff --git a/transcriber.py b/transcriber.py index b35eb32..19fa501 100644 --- a/transcriber.py +++ b/transcriber.py @@ -269,44 +269,68 @@ def transcribe( 
@cli.command()
-@click.argument("json_file", nargs=1)
+@click.argument("source", nargs=1)
 @click.option(
     "--nocheck",
     is_flag=True,
     default=False,
     help="Do not check for existing sources using btctranscripts.com/status.json",
 )
-def preprocess_sources(json_file, nocheck):
-    """Supply sources in a JSON file for preprocess. Preprocessing will fetch
-    all the given sources, and output them in a JSON alongside the available
-    metadata. The JSON can then be edited and piped to `transcribe-from-json`
+@click.option(
+    "--no-batched-output",
+    is_flag=True,
+    default=False,
+    help="Write the preprocessing output in a different JSON file for each source",
+)
+# Options for adding metadata
+@add_title
+@add_date
+@add_tags
+@add_speakers
+@add_category
+@add_loc
+def preprocess(
+    source: str,
+    loc: str,
+    title: str,
+    date: str,
+    tags: list,
+    speakers: list,
+    category: list,
+    nocheck: bool,
+    no_batched_output: bool
+):
+    """Preprocess the provided sources. Supported sources include: \n
+    - YouTube videos and playlists\n
+    - JSON files containing individual sources\n
+
+    Preprocessing will fetch all the given sources, and output them
+    in a JSON alongside the available metadata. 
+ The JSON can then be edited and piped to `transcribe` """ try: configure_logger(log_level=logging.INFO) - check_if_valid_file_path(json_file) - transcription = Transcription() - with open(json_file, "r") as outfile: - sources = json.load(outfile) - outfile.close() - logger.info(f"Sources detected: {len(sources)}") - transcription_sources = [] - for source in sources: - logger.info(f"Preprocessing {source['title']}: {source['source']}") - metadata = configure_metadata_given_from_JSON(source) - excluded_media = source.get( - "existing_entries_not_covered_by_btctranscripts/status.json", []) - excluded_media = [entry["media"] for entry in excluded_media] - transcription_source = transcription.add_transcription_source( - metadata['source_file'], loc=metadata['loc'], - tags=metadata['tags'], category=metadata['category'], - speakers=metadata['speakers'], nocheck=nocheck, - preprocess=True, excluded_media=excluded_media + logger.info(f"Preprocessing sources...") + transcription = Transcription( + batch_preprocessing_output=not no_batched_output) + if source.endswith(".json"): + transcription.add_transcription_source_JSON(source, nocheck=nocheck) + else: + transcription.add_transcription_source( + source_file=source, + loc=loc, + title=title, + date=date, + tags=tags, + category=category, + speakers=speakers, + preprocess=True, + nocheck=nocheck ) - for transcription_source in transcription_source["added"]: - transcription_sources.append(transcription_source) - # Write all preprocessed sources to JSON - write_to_json([source.to_json() for source in transcription_sources], - transcription.model_output_dir, "preprocessed_sources") + if not no_batched_output: + # Batch write all preprocessed sources to JSON + write_to_json([preprocessed_source for preprocessed_source in transcription.preprocessing_output], + transcription.model_output_dir, "preprocessed_sources") except Exception as e: logger.error(e) logger.info(f"Exited with error")