Skip to content

Commit

Permalink
cli: add postprocess-deepgram-transcript
Browse files Browse the repository at this point in the history
- extract postprocessing logic into a separate method
- the new command allows postprocessing of a Deepgram transcript.
Postprocessing combines the output from preprocessing the original
source with the output from Deepgram.
  • Loading branch information
kouloumos committed Dec 1, 2023
1 parent b6d006a commit 8d297df
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 88 deletions.
23 changes: 2 additions & 21 deletions app/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from pytube.exceptions import PytubeError

from app import __app_name__, __version__
from app.utils import write_to_json
from app.logging import get_logger

logger = get_logger()
Expand Down Expand Up @@ -145,26 +144,9 @@ def combine_deepgram_chapters_with_diarization(deepgram_data, chapters):
logger.error(e)


def get_deepgram_transcript(deepgram_data, diarize, title, upload, model_output_dir):
def get_deepgram_transcript(deepgram_data, diarize):
logger = logging.getLogger(__app_name__)

def save_local_json(json_data, title, model_output_dir):
time_in_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
if not os.path.isdir(model_output_dir):
os.makedirs(model_output_dir)
file_path = os.path.join(
model_output_dir, title + "_" + time_in_str + ".json"
)
with open(file_path, "w") as json_file:
json.dump(json_data, json_file, indent=4)
logger.info(f"(deepgram) Model stored at: {file_path}")
return file_path
try:
data_path = write_to_json(
deepgram_data, model_output_dir, title)
logger.info(f"(deepgram) Model stored at: {data_path}")
if upload:
upload_file_to_s3(data_path)
if diarize:
logger.info(f"(deepgram) Processing diarization...")
para = ""
Expand Down Expand Up @@ -291,8 +273,7 @@ def combine_deepgram_with_chapters(deepgram_data, chapters):

return result
except Exception as e:
logger.error("Error combining deepgram with chapters")
logger.error(e)
raise Exception(f"Error combining deepgram with chapters: {e}")


def clean_up(tmp_dir):
Expand Down
20 changes: 18 additions & 2 deletions app/transcript.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import logging
import os
import re
import shutil
import tempfile
from datetime import datetime, date
Expand All @@ -17,7 +18,7 @@

from app import __app_name__, __version__, application
from app.logging import get_logger
from app.utils import slugify
from app.utils import slugify, write_to_json

logger = get_logger()

Expand Down Expand Up @@ -99,10 +100,18 @@ def write_chapters_file():
has_chapters = len(self.source.chapters) > 0
self.result = None
if service == "deepgram" or summarize_transcript:
# process mp3 using deepgram
deepgram_resp = application.process_mp3_deepgram(
self.audio_file, summarize_transcript, diarize)
# store deepgram output
deepgram_output_file_path = write_to_json(
deepgram_resp, model_output_dir, self.title, is_metadata=True)
self.logger.info(
f"(deepgram) Model stored at: {deepgram_output_file_path}")
if upload:
application.upload_file_to_s3(deepgram_output_file_path)
self.result = application.get_deepgram_transcript(
deepgram_resp, diarize, self.title, upload, model_output_dir)
deepgram_resp, diarize)

if summarize_transcript:
self.summary = application.get_deepgram_summary(
Expand Down Expand Up @@ -462,11 +471,18 @@ def extract_chapters_from_downloaded_video_metadata():
except Exception as e:
raise Exception(f"Error processing video file: {e}")

def __str__(self):
    """Return a debug-friendly string of this Video's fields.

    The logger attribute is omitted because it is not part of the
    video's data and does not serialize usefully.
    """
    hidden = {'logger'}
    visible = {name: value for name, value in self.__dict__.items()
               if name not in hidden}
    return f"Video:{visible}"

def to_json(self):
json_data = {
'type': self.type,
'loc': self.loc,
"source_file": self.source_file,
"media": self.media,
'title': self.title,
'categories': self.category,
'tags': self.tags,
Expand Down
74 changes: 42 additions & 32 deletions app/transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,11 @@ def start(self, test_transcript=None):
output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
self.logger.info(
f"Processing source: {transcript.source.source_file}")
tmp_dir = self._create_subdirectory(
transcript.tmp_dir = self._create_subdirectory(
f"transcript{len(self.result) + 1}")
transcript.process_source(tmp_dir)
transcript.process_source(transcript.tmp_dir)
result = transcript.transcribe(
tmp_dir,
transcript.tmp_dir,
self.generate_chapters,
self.summarize_transcript,
self.service,
Expand All @@ -199,39 +199,49 @@ def start(self, test_transcript=None):
output_dir,
test_transcript=test_transcript
)
if self.markdown:
transcription_md_file = transcript.write_to_file(
output_dir if not self.test_mode else tmp_dir,
self.transcript_by)
self.result.append(transcription_md_file)
else:
self.result.append(result)
if self.open_pr:
application.create_pr(
absolute_path=transcription_md_file,
loc=transcript.source.source_file,
username=self.transcript_by,
curr_time=str(round(time.time() * 1000)),
title=transcript.title,
)
else:
transcript_json = transcript.to_json()
transcript_json["transcript_by"] = f"{self.transcript_by} via TBTBTC v{__version__}"
if self.queuer:
self.queuer.push_to_queue(transcript_json)
else:
# store payload for the user to manually send it to the queuer
payload_json_file = write_to_json(
transcript_json,
f"{self.model_output_dir}/{transcript.source.loc}",
f"{transcript.title}_payload"
)
self.logger.info(
f"Transcript not added to the queue, payload stored at: {payload_json_file}")
postprocessed_transcript = self.postprocess(transcript)
self.result.append(postprocessed_transcript)

return self.result
except Exception as e:
raise Exception(f"Error with the transcription: {e}") from e

def postprocess(self, transcript: Transcript):
    """Produce the final artifact for a transcribed source.

    Depending on configuration this either:
    - writes the transcript to a markdown file (and optionally opens a
      PR with it) and returns the file path,
    - pushes a JSON payload to the queuer and returns its response,
    - stores the JSON payload locally for manual queueing and returns
      that file path, or
    - in test mode without markdown, returns the raw transcript result.

    Raises:
        Exception: wrapping any error raised while postprocessing.
    """
    try:
        output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
        if self.markdown:
            # In test mode the markdown goes to the temporary working
            # directory instead of the model output directory.
            write_dir = transcript.tmp_dir if self.test_mode else output_dir
            markdown_file = transcript.write_to_file(
                write_dir, self.transcript_by)
            if self.open_pr:
                application.create_pr(
                    absolute_path=markdown_file,
                    loc=transcript.source.source_file,
                    username=self.transcript_by,
                    curr_time=str(round(time.time() * 1000)),
                    title=transcript.title,
                )
            return markdown_file
        if self.test_mode:
            return transcript.result
        payload = transcript.to_json()
        payload["transcript_by"] = f"{self.transcript_by} via TBTBTC v{__version__}"
        if self.queuer:
            return self.queuer.push_to_queue(payload)
        # No queuer configured: keep the payload on disk so the user can
        # send it to the queuer manually.
        payload_json_file = write_to_json(
            payload,
            output_dir,
            f"{transcript.title}_payload"
        )
        self.logger.info(
            f"Transcript not added to the queue, payload stored at: {payload_json_file}")
        return payload_json_file
    except Exception as e:
        raise Exception(f"Error with postprocessing: {e}") from e

def clean_up(self):
    """Remove this transcription run's temporary working directory.

    Delegates the actual removal to application.clean_up with
    self.tmp_dir; logs before doing so.
    """
    self.logger.info("Cleaning up...")
    application.clean_up(self.tmp_dir)
22 changes: 22 additions & 0 deletions app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,28 @@ def check_if_valid_file_path(file_path):
raise Exception(f"Not a valid file: {file_path}")


def configure_metadata_given_from_JSON(source):
    """Build a metadata dict from a parsed JSON source description.

    Missing optional fields are filled with default values. Note two
    key renames: the JSON's "categories" becomes "category" and
    "youtube" becomes "youtube_metadata".

    Args:
        source: dict parsed from the source JSON.

    Returns:
        dict with keys: source_file, title, speakers, category, tags,
        chapters, loc, date, youtube_metadata, media.

    Raises:
        Exception: if a required field ("source_file" or "title") is
            missing from the JSON.
    """
    # Only the required lookups can raise KeyError; keep the try minimal
    # so defaults below are never mistaken for a parse failure.
    try:
        metadata = {
            "source_file": source["source_file"],
            "title": source["title"],
        }
    except KeyError as e:
        raise Exception(f"Parsing JSON: {e} is required") from e
    # Optional fields fall back to sensible defaults.
    metadata["speakers"] = source.get("speakers", [])
    metadata["category"] = source.get("categories", [])
    metadata["tags"] = source.get("tags", [])
    metadata["chapters"] = source.get("chapters", [])
    metadata["loc"] = source.get("loc", "")
    metadata["date"] = source.get("date", None)
    metadata["youtube_metadata"] = source.get("youtube", None)
    metadata["media"] = source.get("media", None)
    return metadata


def get_status():
"""Helper method to fetch and store status.json locally"""
STATUS_FILE_PATH = "status.json" # the file path for storing the status locally
Expand Down
Loading

0 comments on commit 8d297df

Please sign in to comment.