From 37c086e6a02eb032d9c39862be6024489bc74455 Mon Sep 17 00:00:00 2001 From: kouloumos Date: Tue, 5 Dec 2023 11:58:38 +0200 Subject: [PATCH] cli: replace `preprocess-sources` with `preprocess` preprocessing now also supports individual sources as input following the same pattern as the `transcribe` command --- Readme.md | 23 ++++++++----- app/transcription.py | 47 ++++++++++++++++++++------ transcriber.py | 80 ++++++++++++++++++++++++++++---------------- 3 files changed, 102 insertions(+), 48 deletions(-) diff --git a/Readme.md b/Readme.md index ce6ce69..b2ee34c 100644 --- a/Readme.md +++ b/Readme.md @@ -9,15 +9,20 @@ This cli app transcribes audio and video for submission to the [bitcointranscrip - summarization `--summarize` - diarization `--diarize` -**Features**: - -- Transcription using [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) -- Collection of video's metadata when sourcing from YouTube. -- Open Pull Request on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript. -- Save the resulting transcript to a markdown format supported by bitcointranscripts. -- Upload the resulting transcript to an AWS S3 Bucket -repo. -- Push the resulting transcript to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend), or save the payload in a json for later use. +**Transcription Workflow** + +This transcription tool operates through a structured four-stage process: + +1. Preprocess: Gathers all the available metadata for each source (supports YouTube videos&playlists, and RSS feeds) +2. Process: Downloads and converts sources for transcription preparation +3. Transcription: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts from MP3 files. +4. 
Postprocess: Offers multiple options for further actions: + - **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript. + - **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts. + - **Upload**: Saves transcripts in an AWS S3 Bucket. + - **Push to Queuer backend**: Sends transcripts to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend). + - **Save as JSON**: Preserves transcripts for future use. + ## Prerequisites diff --git a/app/transcription.py b/app/transcription.py index dcbc033..adf68d3 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -26,7 +26,27 @@ class Transcription: - def __init__(self, model="tiny", chapters=False, pr=False, summarize=False, deepgram=False, diarize=False, upload=False, model_output_dir="local_models/", nocleanup=False, queue=True, markdown=False, username=None, test_mode=False, working_dir=None): + def __init__( + self, + model="tiny", + chapters=False, + pr=False, + summarize=False, + deepgram=False, + diarize=False, + upload=False, + model_output_dir="local_models/", + nocleanup=False, + queue=True, + markdown=False, + username=None, + test_mode=False, + working_dir=None, + batch_preprocessing_output=False + ): + self.logger = get_logger() + self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp() + self.model = model self.transcript_by = "username" if test_mode else self.__get_username() self.generate_chapters = chapters @@ -44,8 +64,7 @@ def __init__(self, model="tiny", chapters=False, pr=False, summarize=False, deep self.markdown = markdown or test_mode self.existing_media = None self.test_mode = test_mode - self.logger = get_logger() - self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp() + self.preprocessing_output = [] if batch_preprocessing_output else None self.logger.info(f"Temp directory: {self.tmp_dir}") @@ 
-124,13 +143,18 @@ def check_if_youtube(source: Source): def _new_transcript_from_source(self, source: Source): """Helper method to initialize a new Transcript from source""" self.transcripts.append(Transcript(source, self.test_mode)) - # Save preprocessed source + if source.preprocess: - write_to_json( - source.to_json(), - f"{self.model_output_dir}/{source.loc}", - f"{source.title}_preprocess", is_metadata=True - ) + if self.preprocessing_output is None: + # Save preprocessing output for each individual source + write_to_json( + source.to_json(), + f"{self.model_output_dir}/{source.loc}", + f"{source.title}_preprocess", is_metadata=True + ) + else: + # Keep preprocessing outputs for later use + self.preprocessing_output.append(source.to_json()) def add_transcription_source(self, source_file, loc="misc", title=None, date=None, tags=[], category=[], speakers=[], preprocess=True, youtube_metadata=None, link=None, chapters=None, nocheck=False, excluded_media=[]): """Add a source for transcription""" @@ -185,7 +209,7 @@ def add_transcription_source(self, source_file, loc="misc", title=None, date=Non f"{source.title}: sources added for transcription: {len(transcription_sources['added'])} (Ignored: {len(transcription_sources['exist'])} sources)") return transcription_sources - def add_transcription_source_JSON(self, json_file): + def add_transcription_source_JSON(self, json_file, nocheck=False): # validation checks check_if_valid_file_path(json_file) sources = check_if_valid_json(json_file) @@ -210,7 +234,8 @@ def add_transcription_source_JSON(self, json_file): youtube_metadata=metadata["youtube_metadata"], chapters=metadata["chapters"], link=metadata["media"], - excluded_media=metadata["excluded_media"] + excluded_media=metadata["excluded_media"], + nocheck=nocheck ) def start(self, test_transcript=None): diff --git a/transcriber.py b/transcriber.py index b35eb32..19fa501 100644 --- a/transcriber.py +++ b/transcriber.py @@ -269,44 +269,68 @@ def transcribe( 
@cli.command()
-@click.argument("json_file", nargs=1)
+@click.argument("source", nargs=1)
 @click.option(
     "--nocheck",
     is_flag=True,
     default=False,
     help="Do not check for existing sources using btctranscripts.com/status.json",
 )
-def preprocess_sources(json_file, nocheck):
-    """Supply sources in a JSON file for preprocess. Preprocessing will fetch
-    all the given sources, and output them in a JSON alongside the available
-    metadata. The JSON can then be edited and piped to `transcribe-from-json`
+@click.option(
+    "--no-batched-output",
+    is_flag=True,
+    default=False,
+    help="Write the preprocessing output in a different JSON file for each source",
+)
+# Options for adding metadata
+@add_title
+@add_date
+@add_tags
+@add_speakers
+@add_category
+@add_loc
+def preprocess(
+    source: str,
+    loc: str,
+    title: str,
+    date: str,
+    tags: list,
+    speakers: list,
+    category: list,
+    nocheck: bool,
+    no_batched_output: bool
+):
+    """Preprocess the provided sources. Supported sources include: \n
+    - YouTube videos and playlists\n
+    - JSON files containing individual sources\n
+
+    Preprocessing will fetch all the given sources, and output them
+    in a JSON alongside the available metadata. 
+ The JSON can then be edited and piped to `transcribe` """ try: configure_logger(log_level=logging.INFO) - check_if_valid_file_path(json_file) - transcription = Transcription() - with open(json_file, "r") as outfile: - sources = json.load(outfile) - outfile.close() - logger.info(f"Sources detected: {len(sources)}") - transcription_sources = [] - for source in sources: - logger.info(f"Preprocessing {source['title']}: {source['source']}") - metadata = configure_metadata_given_from_JSON(source) - excluded_media = source.get( - "existing_entries_not_covered_by_btctranscripts/status.json", []) - excluded_media = [entry["media"] for entry in excluded_media] - transcription_source = transcription.add_transcription_source( - metadata['source_file'], loc=metadata['loc'], - tags=metadata['tags'], category=metadata['category'], - speakers=metadata['speakers'], nocheck=nocheck, - preprocess=True, excluded_media=excluded_media + logger.info(f"Preprocessing sources...") + transcription = Transcription( + batch_preprocessing_output=not no_batched_output) + if source.endswith(".json"): + transcription.add_transcription_source_JSON(source, nocheck=nocheck) + else: + transcription.add_transcription_source( + source_file=source, + loc=loc, + title=title, + date=date, + tags=tags, + category=category, + speakers=speakers, + preprocess=True, + nocheck=nocheck ) - for transcription_source in transcription_source["added"]: - transcription_sources.append(transcription_source) - # Write all preprocessed sources to JSON - write_to_json([source.to_json() for source in transcription_sources], - transcription.model_output_dir, "preprocessed_sources") + if not no_batched_output: + # Batch write all preprocessed sources to JSON + write_to_json([preprocessed_source for preprocessed_source in transcription.preprocessing_output], + transcription.model_output_dir, "preprocessed_sources") except Exception as e: logger.error(e) logger.info(f"Exited with error")