Skip to content

Commit

Permalink
cli: add postprocess-deepgram-transcript
Browse files Browse the repository at this point in the history
- extract postprocessing logic into a separate method
- the new command allows postprocessing of a Deepgram transcript.
Postprocessing combines the output from preprocessing the original
source with the output from Deepgram.
  • Loading branch information
kouloumos committed Dec 1, 2023
1 parent b6d006a commit 8d297df
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 88 deletions.
23 changes: 2 additions & 21 deletions app/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from pytube.exceptions import PytubeError

from app import __app_name__, __version__
from app.utils import write_to_json
from app.logging import get_logger

logger = get_logger()
Expand Down Expand Up @@ -145,26 +144,9 @@ def combine_deepgram_chapters_with_diarization(deepgram_data, chapters):
logger.error(e)


def get_deepgram_transcript(deepgram_data, diarize, title, upload, model_output_dir):
def get_deepgram_transcript(deepgram_data, diarize):
logger = logging.getLogger(__app_name__)

def save_local_json(json_data, title, model_output_dir):
time_in_str = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
if not os.path.isdir(model_output_dir):
os.makedirs(model_output_dir)
file_path = os.path.join(
model_output_dir, title + "_" + time_in_str + ".json"
)
with open(file_path, "w") as json_file:
json.dump(json_data, json_file, indent=4)
logger.info(f"(deepgram) Model stored at: {file_path}")
return file_path
try:
data_path = write_to_json(
deepgram_data, model_output_dir, title)
logger.info(f"(deepgram) Model stored at: {data_path}")
if upload:
upload_file_to_s3(data_path)
if diarize:
logger.info(f"(deepgram) Processing diarization...")
para = ""
Expand Down Expand Up @@ -291,8 +273,7 @@ def combine_deepgram_with_chapters(deepgram_data, chapters):

return result
except Exception as e:
logger.error("Error combining deepgram with chapters")
logger.error(e)
raise Exception(f"Error combining deepgram with chapters: {e}")


def clean_up(tmp_dir):
Expand Down
20 changes: 18 additions & 2 deletions app/transcript.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import logging
import os
import re
import shutil
import tempfile
from datetime import datetime, date
Expand All @@ -17,7 +18,7 @@

from app import __app_name__, __version__, application
from app.logging import get_logger
from app.utils import slugify
from app.utils import slugify, write_to_json

logger = get_logger()

Expand Down Expand Up @@ -99,10 +100,18 @@ def write_chapters_file():
has_chapters = len(self.source.chapters) > 0
self.result = None
if service == "deepgram" or summarize_transcript:
# process mp3 using deepgram
deepgram_resp = application.process_mp3_deepgram(
self.audio_file, summarize_transcript, diarize)
# store deepgram output
deepgram_output_file_path = write_to_json(
deepgram_resp, model_output_dir, self.title, is_metadata=True)
self.logger.info(
f"(deepgram) Model stored at: {deepgram_output_file_path}")
if upload:
application.upload_file_to_s3(deepgram_output_file_path)
self.result = application.get_deepgram_transcript(
deepgram_resp, diarize, self.title, upload, model_output_dir)
deepgram_resp, diarize)

if summarize_transcript:
self.summary = application.get_deepgram_summary(
Expand Down Expand Up @@ -462,11 +471,18 @@ def extract_chapters_from_downloaded_video_metadata():
except Exception as e:
raise Exception(f"Error processing video file: {e}")

def __str__(self):
    """Return a debug-friendly string of this Video's fields.

    The logger attribute is omitted because it is not part of the
    video's data and does not serialize usefully.
    """
    hidden = {'logger'}
    visible = {name: value for name, value in self.__dict__.items()
               if name not in hidden}
    return f"Video:{visible}"

def to_json(self):
json_data = {
'type': self.type,
'loc': self.loc,
"source_file": self.source_file,
"media": self.media,
'title': self.title,
'categories': self.category,
'tags': self.tags,
Expand Down
74 changes: 42 additions & 32 deletions app/transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,11 @@ def start(self, test_transcript=None):
output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
self.logger.info(
f"Processing source: {transcript.source.source_file}")
tmp_dir = self._create_subdirectory(
transcript.tmp_dir = self._create_subdirectory(
f"transcript{len(self.result) + 1}")
transcript.process_source(tmp_dir)
transcript.process_source(transcript.tmp_dir)
result = transcript.transcribe(
tmp_dir,
transcript.tmp_dir,
self.generate_chapters,
self.summarize_transcript,
self.service,
Expand All @@ -199,39 +199,49 @@ def start(self, test_transcript=None):
output_dir,
test_transcript=test_transcript
)
if self.markdown:
transcription_md_file = transcript.write_to_file(
output_dir if not self.test_mode else tmp_dir,
self.transcript_by)
self.result.append(transcription_md_file)
else:
self.result.append(result)
if self.open_pr:
application.create_pr(
absolute_path=transcription_md_file,
loc=transcript.source.source_file,
username=self.transcript_by,
curr_time=str(round(time.time() * 1000)),
title=transcript.title,
)
else:
transcript_json = transcript.to_json()
transcript_json["transcript_by"] = f"{self.transcript_by} via TBTBTC v{__version__}"
if self.queuer:
self.queuer.push_to_queue(transcript_json)
else:
# store payload for the user to manually send it to the queuer
payload_json_file = write_to_json(
transcript_json,
f"{self.model_output_dir}/{transcript.source.loc}",
f"{transcript.title}_payload"
)
self.logger.info(
f"Transcript not added to the queue, payload stored at: {payload_json_file}")
postprocessed_transcript = self.postprocess(transcript)
self.result.append(postprocessed_transcript)

return self.result
except Exception as e:
raise Exception(f"Error with the transcription: {e}") from e

def postprocess(self, transcript: Transcript):
    """Produce the final artifact for a transcribed source.

    Depending on configuration this either:
    - writes the transcript to a markdown file (and optionally opens a
      PR with it) and returns the file path,
    - pushes a JSON payload to the queuer and returns its response,
    - stores the JSON payload locally for manual queueing and returns
      that file path, or
    - in test mode without markdown, returns the raw transcript result.

    Raises:
        Exception: wrapping any error raised while postprocessing.
    """
    try:
        output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
        if self.markdown:
            # In test mode the markdown goes to the temporary working
            # directory instead of the model output directory.
            write_dir = transcript.tmp_dir if self.test_mode else output_dir
            markdown_file = transcript.write_to_file(
                write_dir, self.transcript_by)
            if self.open_pr:
                application.create_pr(
                    absolute_path=markdown_file,
                    loc=transcript.source.source_file,
                    username=self.transcript_by,
                    curr_time=str(round(time.time() * 1000)),
                    title=transcript.title,
                )
            return markdown_file
        if self.test_mode:
            return transcript.result
        payload = transcript.to_json()
        payload["transcript_by"] = f"{self.transcript_by} via TBTBTC v{__version__}"
        if self.queuer:
            return self.queuer.push_to_queue(payload)
        # No queuer configured: keep the payload on disk so the user can
        # send it to the queuer manually.
        payload_json_file = write_to_json(
            payload,
            output_dir,
            f"{transcript.title}_payload"
        )
        self.logger.info(
            f"Transcript not added to the queue, payload stored at: {payload_json_file}")
        return payload_json_file
    except Exception as e:
        raise Exception(f"Error with postprocessing: {e}") from e

def clean_up(self):
    """Remove this transcription run's temporary working directory.

    Delegates the actual removal to application.clean_up with
    self.tmp_dir; logs before doing so.
    """
    self.logger.info("Cleaning up...")
    application.clean_up(self.tmp_dir)
22 changes: 22 additions & 0 deletions app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,28 @@ def check_if_valid_file_path(file_path):
raise Exception(f"Not a valid file: {file_path}")


def configure_metadata_given_from_JSON(source):
    """Build a metadata dict from a parsed JSON source description.

    Missing optional fields are filled with default values. Note two
    key renames: the JSON's "categories" becomes "category" and
    "youtube" becomes "youtube_metadata".

    Args:
        source: dict parsed from the source JSON.

    Returns:
        dict with keys: source_file, title, speakers, category, tags,
        chapters, loc, date, youtube_metadata, media.

    Raises:
        Exception: if a required field ("source_file" or "title") is
            missing from the JSON.
    """
    # Only the required lookups can raise KeyError; keep the try minimal
    # so defaults below are never mistaken for a parse failure.
    try:
        metadata = {
            "source_file": source["source_file"],
            "title": source["title"],
        }
    except KeyError as e:
        raise Exception(f"Parsing JSON: {e} is required") from e
    # Optional fields fall back to sensible defaults.
    metadata["speakers"] = source.get("speakers", [])
    metadata["category"] = source.get("categories", [])
    metadata["tags"] = source.get("tags", [])
    metadata["chapters"] = source.get("chapters", [])
    metadata["loc"] = source.get("loc", "")
    metadata["date"] = source.get("date", None)
    metadata["youtube_metadata"] = source.get("youtube", None)
    metadata["media"] = source.get("media", None)
    return metadata


def get_status():
"""Helper method to fetch and store status.json locally"""
STATUS_FILE_PATH = "status.json" # the file path for storing the status locally
Expand Down
Loading

0 comments on commit 8d297df

Please sign in to comment.