Skip to content

Commit

Permalink
cli: replace preprocess-sources with preprocess
Browse files Browse the repository at this point in the history
preprocessing now also supports individual sources as input, following
the same pattern as the `transcribe` command
  • Loading branch information
kouloumos committed Dec 5, 2023
1 parent 2fc03a7 commit 37c086e
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 48 deletions.
23 changes: 14 additions & 9 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,20 @@ This cli app transcribes audio and video for submission to the [bitcointranscrip
- summarization `--summarize`
- diarization `--diarize`

**Features**:

- Transcription using [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/)
- Collection of video's metadata when sourcing from YouTube.
- Open Pull Request on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript.
- Save the resulting transcript to a markdown format supported by bitcointranscripts.
- Upload the resulting transcript to an AWS S3 Bucket
repo.
- Push the resulting transcript to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend), or save the payload in a json for later use.
**Transcription Workflow**

This transcription tool operates through a structured four-stage process:

1. Preprocess: Gathers all the available metadata for each source (supports YouTube videos & playlists, and RSS feeds)
2. Process: Downloads and converts sources for transcription preparation
3. Transcribe: Utilizes [`openai-whisper`](https://github.com/openai/whisper) or [Deepgram](https://deepgram.com/) to generate transcripts from MP3 files.
4. Postprocess: Offers multiple options for further actions:
- **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript.
- **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts.
- **Upload**: Saves transcripts in an AWS S3 Bucket.
- **Push to Queuer backend**: Sends transcripts to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend).
- **Save as JSON**: Preserves transcripts for future use.


## Prerequisites

Expand Down
47 changes: 36 additions & 11 deletions app/transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,27 @@


class Transcription:
def __init__(self, model="tiny", chapters=False, pr=False, summarize=False, deepgram=False, diarize=False, upload=False, model_output_dir="local_models/", nocleanup=False, queue=True, markdown=False, username=None, test_mode=False, working_dir=None):
def __init__(
self,
model="tiny",
chapters=False,
pr=False,
summarize=False,
deepgram=False,
diarize=False,
upload=False,
model_output_dir="local_models/",
nocleanup=False,
queue=True,
markdown=False,
username=None,
test_mode=False,
working_dir=None,
batch_preprocessing_output=False
):
self.logger = get_logger()
self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp()

self.model = model
self.transcript_by = "username" if test_mode else self.__get_username()
self.generate_chapters = chapters
Expand All @@ -44,8 +64,7 @@ def __init__(self, model="tiny", chapters=False, pr=False, summarize=False, deep
self.markdown = markdown or test_mode
self.existing_media = None
self.test_mode = test_mode
self.logger = get_logger()
self.tmp_dir = working_dir if working_dir is not None else tempfile.mkdtemp()
self.preprocessing_output = [] if batch_preprocessing_output else None

self.logger.info(f"Temp directory: {self.tmp_dir}")

Expand Down Expand Up @@ -124,13 +143,18 @@ def check_if_youtube(source: Source):
def _new_transcript_from_source(self, source: Source):
    """Initialize a new Transcript from *source* and record its preprocessing output.

    Appends a ``Transcript(source, self.test_mode)`` to ``self.transcripts``.
    When ``source.preprocess`` is set, the source's JSON metadata is either:

    - written immediately to its own file under
      ``{self.model_output_dir}/{source.loc}`` (per-source output mode,
      i.e. ``self.preprocessing_output is None``), or
    - accumulated in ``self.preprocessing_output`` for a single batched
      write later (batch mode, selected via ``batch_preprocessing_output``
      at construction time).
    """
    self.transcripts.append(Transcript(source, self.test_mode))

    if source.preprocess:
        if self.preprocessing_output is None:
            # Per-source mode: save preprocessing output for each
            # individual source in its own metadata JSON file.
            write_to_json(
                source.to_json(),
                f"{self.model_output_dir}/{source.loc}",
                f"{source.title}_preprocess", is_metadata=True
            )
        else:
            # Batch mode: keep preprocessing outputs in memory so the
            # caller can write them all out together later.
            self.preprocessing_output.append(source.to_json())

def add_transcription_source(self, source_file, loc="misc", title=None, date=None, tags=[], category=[], speakers=[], preprocess=True, youtube_metadata=None, link=None, chapters=None, nocheck=False, excluded_media=[]):
"""Add a source for transcription"""
Expand Down Expand Up @@ -185,7 +209,7 @@ def add_transcription_source(self, source_file, loc="misc", title=None, date=Non
f"{source.title}: sources added for transcription: {len(transcription_sources['added'])} (Ignored: {len(transcription_sources['exist'])} sources)")
return transcription_sources

def add_transcription_source_JSON(self, json_file):
def add_transcription_source_JSON(self, json_file, nocheck=False):
# validation checks
check_if_valid_file_path(json_file)
sources = check_if_valid_json(json_file)
Expand All @@ -210,7 +234,8 @@ def add_transcription_source_JSON(self, json_file):
youtube_metadata=metadata["youtube_metadata"],
chapters=metadata["chapters"],
link=metadata["media"],
excluded_media=metadata["excluded_media"]
excluded_media=metadata["excluded_media"],
nocheck=nocheck
)

def start(self, test_transcript=None):
Expand Down
80 changes: 52 additions & 28 deletions transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,44 +269,68 @@ def transcribe(


@cli.command()
@click.argument("json_file", nargs=1)
@click.argument("source", nargs=1)
@click.option(
"--nocheck",
is_flag=True,
default=False,
help="Do not check for existing sources using btctranscripts.com/status.json",
)
def preprocess_sources(json_file, nocheck):
"""Supply sources in a JSON file for preprocess. Preprocessing will fetch
all the given sources, and output them in a JSON alongside the available
metadata. The JSON can then be edited and piped to `transcribe-from-json`
@click.option(
"--no-batched-output",
is_flag=True,
default=False,
help="Output preprocessing output in a different JSON file for each source",
)
# Options for adding metadata
@add_title
@add_date
@add_tags
@add_speakers
@add_category
@add_loc
def preprocess(
source: str,
loc: str,
title: str,
date: str,
tags: list,
speakers: list,
category: list,
nocheck: bool,
no_batched_output: bool
):
"""Preprocess the provided sources. Suported sources include: \n
- YouTube videos and playlists\n
- JSON files containing individual sources\n
Preprocessing will fetch all the given sources, and output them
in a JSON alongside the available metadata.
The JSON can then be edited and piped to `transcribe`
"""
try:
configure_logger(log_level=logging.INFO)
check_if_valid_file_path(json_file)
transcription = Transcription()
with open(json_file, "r") as outfile:
sources = json.load(outfile)
outfile.close()
logger.info(f"Sources detected: {len(sources)}")
transcription_sources = []
for source in sources:
logger.info(f"Preprocessing {source['title']}: {source['source']}")
metadata = configure_metadata_given_from_JSON(source)
excluded_media = source.get(
"existing_entries_not_covered_by_btctranscripts/status.json", [])
excluded_media = [entry["media"] for entry in excluded_media]
transcription_source = transcription.add_transcription_source(
metadata['source_file'], loc=metadata['loc'],
tags=metadata['tags'], category=metadata['category'],
speakers=metadata['speakers'], nocheck=nocheck,
preprocess=True, excluded_media=excluded_media
logger.info(f"Preprocessing sources...")
transcription = Transcription(
batch_preprocessing_output=not no_batched_output)
if source.endswith(".json"):
transcription.add_transcription_source_JSON(source, nocheck=nocheck)
else:
transcription.add_transcription_source(
source_file=source,
loc=loc,
title=title,
date=date,
tags=tags,
category=category,
speakers=speakers,
preprocess=True,
nocheck=nocheck
)
for transcription_source in transcription_source["added"]:
transcription_sources.append(transcription_source)
# Write all preprocessed sources to JSON
write_to_json([source.to_json() for source in transcription_sources],
transcription.model_output_dir, "preprocessed_sources")
if not no_batched_output:
# Batch write all preprocessed sources to JSON
write_to_json([preprocessed_source for preprocessed_source in transcription.preprocessing_output],
transcription.model_output_dir, "preprocessed_sources")
except Exception as e:
logger.error(e)
logger.info(f"Exited with error")
Expand Down

0 comments on commit 37c086e

Please sign in to comment.