cli: finalize the postprocess command
- support postprocessing for both whisper and deepgram

How this works:
- When a source is added for transcription, it is preprocessed by default.
  Preprocessing outputs a JSON file with all the available metadata.
- At the transcription stage, the output generated by the transcription
  service is also stored as JSON and referenced in the initial metadata JSON.
- With both JSON files available, the user can then manually run the
  postprocess stage.
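
For illustration, a minimal sketch of how the two JSON files end up linked. The `_metadata` filename suffix and the `deepgram_output`/`whisper_output` keys come from this commit; every other field below is a placeholder, not the actual `source.to_json()` output.

```python
# <title>_metadata.json -- written at the preprocess stage, then updated by the
# transcription service with a pointer to its raw output (illustrative sketch).
example_metadata = {
    "title": "some-talk",      # placeholder field
    "loc": "misc",             # placeholder field
    "chapters": [],            # placeholder field
    # added after transcription by write_to_json_file():
    "deepgram_output": "output/misc/some-talk.json",
}
# The postprocess stage can then read this metadata file, follow the
# deepgram_output (or whisper_output) path, and rebuild the final transcript.
```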
kouloumos committed Dec 7, 2023
1 parent 06a4bf7 commit 5566d65
Showing 7 changed files with 203 additions and 113 deletions.
10 changes: 5 additions & 5 deletions Readme.md
@@ -17,11 +17,11 @@ This transcription tool operates through a structured four-stage process:
2. Process: Downloads and converts sources for transcription preparation
3. Transcription: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts.
1. Converts audio to text.
- Preserves raw whisper transcript in SRT
- Preserves raw deepgram output in JSON
2. Summarize: Generates a summary of the transcript. [only available with deepgram]
3. Upload: Saves raw transcript files in an AWS S3 Bucket [optional]
4. Constructs the resulting transcript.
- Save as JSON: Preserves the output of the transcription service for future use.
- Save as SRT: Generates SRT file [whisper only]
2. Summarize: Generates a summary of the transcript. [deepgram only]
3. Upload: Saves transcription service output in an AWS S3 Bucket [optional]
4. Finalizes the resulting transcript.
- Process diarization. [deepgram only]
- Process chapters.
4. Postprocess: Offers multiple options for further actions:
99 changes: 65 additions & 34 deletions app/services/deepgram.py
@@ -1,3 +1,4 @@
import json
import mimetypes

import deepgram
@@ -45,14 +46,32 @@ def audio_to_text(self, audio_file):
except Exception as e:
raise Exception(f"(deepgram) Error transcribing audio to text: {e}")

def process_with_diarization_and_chapters(self, raw_transcript, chapters):
def write_to_json_file(self, transcription_service_output, transcript: Transcript):
transcription_service_output_file = utils.write_to_json(
transcription_service_output, f"{self.output_dir}/{transcript.source.loc}", transcript.title, is_metadata=True)
logger.info(
f"(deepgram) Model stored at: {transcription_service_output_file}")
# Add deepgram output file path to transcript's metadata file
if transcript.metadata_file is not None:
# Read existing content of the metadata file
with open(transcript.metadata_file, 'r') as file:
data = json.load(file)
# Add deepgram output
data['deepgram_output'] = transcription_service_output_file
# Write the updated dictionary back to the JSON file
with open(transcript.metadata_file, 'w') as file:
json.dump(data, file, indent=4)

return transcription_service_output_file

def process_with_diarization_and_chapters(self, transcription_service_output, chapters):
logger.info(
"(deepgram) Processing diarization with detected chapters...")
try:
para = ""
string = ""
curr_speaker = None
words = raw_transcript["results"]["channels"][0]["alternatives"][0][
words = transcription_service_output["results"]["channels"][0]["alternatives"][0][
"words"
]
words_pointer = 0
@@ -102,12 +121,12 @@ def process_with_diarization_and_chapters(self, raw_transcript, chapters):
except Exception as e:
raise Exception(f"Error combining deepgram chapters: {e}")

def process_with_diarization(self, raw_transcript):
def process_with_diarization(self, transcription_service_output):
logger.info(f"(deepgram) Processing diarization...")
para = ""
string = ""
curr_speaker = None
for word in raw_transcript["results"]["channels"][0]["alternatives"][0][
for word in transcription_service_output["results"]["channels"][0]["alternatives"][0][
"words"
]:
if word["speaker"] != curr_speaker:
string = string + para
return string

def process_with_chapters(self, raw_transcript, chapters):
def process_with_chapters(self, transcription_service_output, chapters):
logger.info("(deepgram) Combining transcript with detected chapters...")
try:
chapters_pointer = 0
words_pointer = 0
result = ""
words = raw_transcript["results"]["channels"][0]["alternatives"][0][
words = transcription_service_output["results"]["channels"][0]["alternatives"][0][
"words"
]
# chapters index, start time, name
@@ -163,9 +182,12 @@ def process_with_chapters(self, raw_transcript, chapters):
except Exception as e:
raise Exception(f"Error combining deepgram with chapters: {e}")

def process_summary(self, raw_transcript):
def process_summary(self, transcript: Transcript):
with open(transcript.transcription_service_output_file, "r") as outfile:
transcription_service_output = json.load(outfile)

try:
summaries = raw_transcript["results"]["channels"][0]["alternatives"][0][
summaries = transcription_service_output["results"]["channels"][0]["alternatives"][0][
"summaries"
]
summary = ""
except Exception as e:
logger.error(f"Error getting summary: {e}")

def construct_transcript(self, raw_transcript, chapters):
if len(chapters) > 0:
# With chapters
if self.diarize:
# With diarization
return self.process_with_diarization_and_chapters(raw_transcript, chapters)
else:
# Without diarization
return self.process_with_chapters(raw_transcript, chapters)
else:
# Without chapters
if self.diarize:
# With diarization
return self.process_with_diarization(raw_transcript)
def finalize_transcript(self, transcript: Transcript):
try:
with open(transcript.transcription_service_output_file, "r") as outfile:
transcription_service_output = json.load(outfile)

has_diarization = any(
'speaker' in word for word in transcription_service_output['results']['channels'][0]['alternatives'][0]['words'])
has_chapters = len(transcript.source.chapters) > 0

if has_chapters:
# With chapters
if has_diarization:
# With diarization
return self.process_with_diarization_and_chapters(transcription_service_output, transcript.source.chapters)
else:
# Without diarization
return self.process_with_chapters(transcription_service_output, transcript.source.chapters)
else:
# Without diarization
return raw_transcript["results"]["channels"][0]["alternatives"][0]["transcript"]
# Without chapters
if has_diarization:
# With diarization
return self.process_with_diarization(transcription_service_output)
else:
# Without diarization
return transcription_service_output["results"]["channels"][0]["alternatives"][0]["transcript"]

return result
return result
except Exception as e:
raise Exception(f"(deepgram) Error finalizing transcript: {e}")

def transcribe(self, transcript: Transcript):
try:
raw_transcript = self.audio_to_text(transcript.audio_file)
raw_transcript_file = utils.write_to_json(
raw_transcript, f"{self.output_dir}/{transcript.source.loc}", transcript.title, is_metadata=True)
logger.info(
f"(deepgram) Model stored at: {raw_transcript_file}")
transcription_service_output = self.audio_to_text(
transcript.audio_file)
transcript.transcription_service_output_file = self.write_to_json_file(
transcription_service_output, transcript)
if self.upload:
application.upload_file_to_s3(raw_transcript_file)
application.upload_file_to_s3(
transcript.transcription_service_output_file)
if self.summarize:
transcript.summary = self.process_summary(raw_transcript)
transcript.result = self.construct_transcript(
raw_transcript, transcript.source.chapters)
transcript.summary = self.process_summary(transcript)
transcript.result = self.finalize_transcript(transcript)

return transcript
except Exception as e:
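As an aside, the nested indexing in this file is easier to follow with a rough sketch of the parts of the Deepgram response that the code touches. Only the fields accessed above are shown; field names other than `speaker` follow Deepgram's documented prerecorded response and are not all visible in this diff.

```python
# Approximate shape of transcription_service_output as used above (not exhaustive).
example_output = {
    "results": {
        "channels": [{
            "alternatives": [{
                "transcript": "full transcript text ...",
                "summaries": [{"summary": "..."}],  # consumed by process_summary
                "words": [
                    # "speaker" is only present when diarization was requested
                    {"word": "hello", "start": 0.0, "end": 0.4, "speaker": 0},
                ],
            }],
        }],
    },
}
```
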
89 changes: 54 additions & 35 deletions app/services/whisper.py
@@ -1,3 +1,5 @@
import json

import whisper

from app import (
@@ -22,15 +24,31 @@ def audio_to_text(self, audio_file):
try:
my_model = whisper.load_model(self.model)
result = my_model.transcribe(audio_file)
data = []
for x in result["segments"]:
data.append(tuple((x["start"], x["end"], x["text"])))
return data

return result
except Exception as e:
logger.error(
f"(wisper,{service}) Error transcribing audio to text: {e}")
return

def write_to_json_file(self, transcription_service_output, transcript: Transcript):
transcription_service_output_file = utils.write_to_json(
transcription_service_output, f"{self.output_dir}/{transcript.source.loc}", transcript.title, is_metadata=True)
logger.info(
f"(whisper) Model stored at: {transcription_service_output_file}")
# Add whisper output file path to transcript's metadata file
if transcript.metadata_file is not None:
# Read existing content of the metadata file
with open(transcript.metadata_file, 'r') as file:
data = json.load(file)
# Add whisper output
data['whisper_output'] = transcription_service_output_file
# Write the updated dictionary back to the JSON file
with open(transcript.metadata_file, 'w') as file:
json.dump(data, file, indent=4)

return transcription_service_output_file

def generate_srt(self, data, filename, loc):
def format_time(time):
hours = int(time / 3600)
@@ -42,71 +60,72 @@ def format_time(time):
output_file = f"{utils.configure_output_file_path(f'{self.output_dir}/{loc}', filename, is_metadata=True)}.srt"
logger.info(f"(whisper) Writing srt to {output_file}...")
with open(output_file, "w") as f:
for index, segment in enumerate(data):
start_time, end_time, text = segment
for index, segment in enumerate(data["segments"]):
f.write(f"{index+1}\n")
f.write(
f"{format_time(start_time)} --> {format_time(end_time)}\n")
f.write(f"{text.strip()}\n\n")
f"{format_time(segment['start'])} --> {format_time(segment['end'])}\n")
f.write(f"{segment['text'].strip()}\n\n")
return output_file

def process_with_chapters(self, raw_transcript, chapters):
def process_with_chapters(self, transcription_service_output, chapters):
logger.info("(whisper) Combining transcript with detected chapters...")
try:
chapters_pointer = 0
transcript_pointer = 0
result = ""
segments = transcription_service_output["segments"]
# chapters index, start time, name
# transcript start time, end time, text

while chapters_pointer < len(chapters) and transcript_pointer < len(
raw_transcript
segments
):
if (
chapters[chapters_pointer][1]
<= raw_transcript[transcript_pointer][0]
<= segments[transcript_pointer]["start"]
):
result = (
result + "\n\n## " +
chapters[chapters_pointer][2] + "\n\n"
)
chapters_pointer += 1
else:
result = result + raw_transcript[transcript_pointer][2]
result = result + segments[transcript_pointer]["text"]
transcript_pointer += 1

while transcript_pointer < len(raw_transcript):
result = result + raw_transcript[transcript_pointer][2]
while transcript_pointer < len(segments):
result = result + segments[transcript_pointer]["text"]
transcript_pointer += 1

return result
except Exception as e:
logger.error("Error combining chapters")
logger.error(e)

def process_default(self):
result = ""
for x in self.result:
result = result + x[2] + " "

return result

def construct_transcript(self, raw_transcript, chapters):
if len(chapters) > 0:
# Source has chapters, add them to transcript
return self.process_with_chapters(raw_transcript, chapters)
else:
return self.process_default(raw_transcript)
def finalize_transcript(self, transcript: Transcript):
try:
with open(transcript.transcription_service_output_file, "r") as outfile:
transcription_service_output = json.load(outfile)

has_chapters = len(transcript.source.chapters) > 0
if has_chapters:
# Source has chapters, add them to transcript
return self.process_with_chapters(transcription_service_output, transcript.source.chapters)
else:
return transcription_service_output["text"]
except Exception as e:
raise Exception(f"(whisper) Error finalizing transcript: {e}")

def transcribe(self, transcript: Transcript):
try:
raw_transcript = self.audio_to_text(transcript.audio_file)
raw_transcript_file = self.generate_srt(
raw_transcript, transcript.title, transcript.source.loc)
transcription_service_output = self.audio_to_text(
transcript.audio_file)
transcript.transcription_service_output_file = self.write_to_json_file(
transcription_service_output, transcript)
transcript_srt_file = self.generate_srt(
transcription_service_output, transcript.title, transcript.source.loc)
if self.upload:
application.upload_file_to_s3(raw_transcript_file)

transcript.result = construct_transcript(
raw_transcript, transcript.source.chapters)
application.upload_file_to_s3(transcript_srt_file)
transcript.result = self.finalize_transcript(transcript)

return transcript
except Exception as e:
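Similarly, a rough sketch of the `openai-whisper` result that this file now stores as JSON and renders to SRT. Only the fields used above are shown; the segment values are made up.

```python
# Approximate shape of whisper.transcribe() output as used above (not exhaustive).
example_output = {
    "text": " full transcript ...",  # returned directly when the source has no chapters
    "segments": [
        {"start": 0.0, "end": 3.2, "text": " first segment ..."},
        {"start": 3.2, "end": 7.9, "text": " second segment ..."},
    ],
}
# generate_srt() would render each segment roughly as:
#   1
#   00:00:00,000 --> 00:00:03,200
#   first segment ...
```
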
5 changes: 4 additions & 1 deletion app/transcript.py
@@ -24,8 +24,11 @@


class Transcript:
def __init__(self, source, test_mode=False):
def __init__(self, source, test_mode=False, metadata_file=None):
self.source = source
self.metadata_file = metadata_file
# The output generated by the transcription service
self.transcription_service_output_file = None
self.summary = None
self.test_mode = test_mode
self.logger = get_logger()
12 changes: 7 additions & 5 deletions app/transcription.py
@@ -118,7 +118,7 @@ def check_if_youtube(source: Source):
raise Exception(f"Invalid source: {e}")
try:
if source.source_file.endswith(".mp3") or source.source_file.endswith(".wav") or source.source_file.endswith(".m4a"):
return Audio(source=source)
return Audio(source=source, chapters=chapters)
if source.source_file.endswith("rss") or source.source_file.endswith(".xml"):
return RSS(source=source)

@@ -139,19 +139,21 @@ def check_if_youtube(source: Source):

def _new_transcript_from_source(self, source: Source):
"""Helper method to initialize a new Transcript from source"""
self.transcripts.append(Transcript(source, self.test_mode))

metadata_file = None
if source.preprocess:
if self.preprocessing_output is None:
# Save preprocessing output for each individual source
write_to_json(
metadata_file = utils.write_to_json(
source.to_json(),
f"{self.model_output_dir}/{source.loc}",
f"{source.title}_preprocess", is_metadata=True
f"{source.title}_metadata", is_metadata=True
)
else:
# Keep preprocessing outputs for later use
self.preprocessing_output.append(source.to_json())
# Initialize new transcript from source
self.transcripts.append(Transcript(
source=source, test_mode=self.test_mode, metadata_file=metadata_file))

def add_transcription_source(
self,
8 changes: 8 additions & 0 deletions app/utils.py
@@ -77,6 +77,14 @@ def configure_metadata_given_from_JSON(source):
"existing_entries_not_covered_by_btctranscripts/status.json", [])
metadata["excluded_media"] = [entry["media"]
for entry in excluded_media]
# transcription service output
services = ["whisper", "deepgram"]
for service in services:
key = f"{service}_output"
metadata[key] = source.get(key, None)
if metadata[key] is not None:
check_if_valid_file_path(metadata[key])

return metadata
except KeyError as e:
raise Exception(f"Parsing JSON: {e} is required")