diff --git a/Readme.md b/Readme.md
index 35d8be5..08cfac6 100644
--- a/Readme.md
+++ b/Readme.md
@@ -79,7 +79,7 @@ To check the version:
 
 ## Usage
 
-`tstbtc {source_file/url} {directory}` transcribe the given source
+`tstbtc transcribe {source_file/url} {directory}` transcribe the given source
 
 Suported sources:
 - YouTube videos
@@ -120,13 +120,13 @@ from Stephan Livera's podcast and add the associated metadata, we would run eith
 of the below commands. The first uses short argument tags, while the second
 uses long argument tags. The result is the same.
 
-- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast -t 'OP_Vault - A New Way to HODL?' -d '2023-01-30' -T 'script' -T 'op_vault' -s 'James O’Beirne' -s 'Stephan Livera' -c ‘podcast’`
-- `tstbtc Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast --title 'OP_Vault - A New Way to HODL?' --date '2023-01-30' --tags 'script' --tags 'op_vault' --speakers 'James O’Beirne' --speakers 'Stephan Livera' --category ‘podcast’`
+- `tstbtc transcribe Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast -t 'OP_Vault - A New Way to HODL?' -d '2023-01-30' -T 'script' -T 'op_vault' -s 'James O’Beirne' -s 'Stephan Livera' -c ‘podcast’`
+- `tstbtc transcribe Nq6WxJ0PgJ4 bitcointranscripts/stephan-livera-podcast --title 'OP_Vault - A New Way to HODL?' --date '2023-01-30' --tags 'script' --tags 'op_vault' --speakers 'James O’Beirne' --speakers 'Stephan Livera' --category ‘podcast’`
 
 You can also transcribe a remote audio/mp3 link, such as the following from Stephan Livera's podcast:
 ```shell
 mp3_link="https://anchor.fm/s/7d083a4/podcast/play/64348045/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2023-1-1%2Ff7fafb12-9441-7d85-d557-e9e5d18ab788.mp3"
-tstbtc $mp3_link bitcointranscripts/stephan-livera-podcast --title 'SLP455 Anant Tapadia - Single Sig or Multi Sig?' --date '2023-02-01' --tags 'multisig' --speakers 'Anant Tapadia' --speakers 'Stephan Livera' --category 'podcast'
+tstbtc transcribe $mp3_link bitcointranscripts/stephan-livera-podcast --title 'SLP455 Anant Tapadia - Single Sig or Multi Sig?' --date '2023-02-01' --tags 'multisig' --speakers 'Anant Tapadia' --speakers 'Stephan Livera' --category 'podcast'
 ```
 
 ## Testing
diff --git a/app/transcript.py b/app/transcript.py
index eb827f9..08a22b2 100644
--- a/app/transcript.py
+++ b/app/transcript.py
@@ -235,12 +235,14 @@ def initialize(self):
 
 
 class Audio(Source):
-    def __init__(self, source):
+    def __init__(self, source, description=None, chapters=[]):
         try:
             # initialize source using a base Source
             super().__init__(source_file=source.source_file, link=source.link, loc=source.loc, local=source.local, title=source.title,
                              date=source.event_date, tags=source.tags, category=source.category, speakers=source.speakers, preprocess=source.preprocess)
             self.type = "audio"
+            self.description = description
+            self.chapters = chapters
             self.__config_source()
         except Exception as e:
             raise Exception(f"Error during Audio creation: {e}")
@@ -299,6 +301,20 @@ def download_audio():
         except Exception as e:
             raise Exception(f"Error processing audio file: {e}")
 
+    def to_json(self):
+        return {
+            'type': self.type,
+            'loc': self.loc,
+            "source_file": self.source_file,
+            "media": self.media,
+            'title': self.title,
+            'categories': self.category,
+            'tags': self.tags,
+            'speakers': self.speakers,
+            'date': self.event_date.strftime("%Y-%m-%d"),
+            'description': self.description,
+            'chapters': self.chapters,
+        }
 
 
 class Video(Source):
@@ -418,6 +434,20 @@ def extract_chapters_from_downloaded_video_metadata():
         except Exception as e:
             raise Exception(f"Error processing video file: {e}")
 
+    def to_json(self):
+        return {
+            'type': self.type,
+            'loc': self.loc,
+            "source_file": self.source_file,
+            'title': self.title,
+            'categories': self.category,
+            'tags': self.tags,
+            'speakers': self.speakers,
+            'date': self.event_date.strftime("%Y-%m-%d"),
+            'chapters': self.chapters,
+            'youtube': self.youtube_metadata
+        }
 
 class Playlist(Source):
     def __init__(self, source, entries, preprocess=False):
@@ -462,7 +492,7 @@ def __config_source(self):
                 if enclosure.type in ['audio/mpeg', 'audio/wav', 'audio/x-m4a']:
                     published_date = date(*entry.published_parsed[:3])
                     source = Audio(Source(enclosure.href, self.loc, self.local, entry.title, published_date, self.tags,
-                                          self.category, self.speakers, self.preprocess, link=entry.link))
+                                          self.category, self.speakers, self.preprocess, link=entry.link), description=entry.description)
                     self.entries.append(source)
                 else:
                     self.logger.warning(
diff --git a/app/utils.py b/app/utils.py
index ef872e9..5feeb8c 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -26,6 +26,11 @@ def write_to_json(json_data, output_dir, filename, add_timestamp=True):
     return file_path
 
 
+def check_if_valid_file_path(file_path):
+    if not isinstance(file_path, str) or not os.path.isfile(file_path):
+        raise Exception(f"Not a valid file: {file_path}")
+
+
 def get_status():
     """Helper method to fetch and store status.json locally"""
     STATUS_FILE_PATH = "status.json"  # the file path for storing the status locally
diff --git a/setup.py b/setup.py
index 6c7db35..023ed4e 100644
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,6 @@
     ],
     entry_points="""
     [console_scripts]
-    tstbtc=transcriber:add
+    tstbtc=transcriber:cli
     """,
 )
diff --git a/transcriber.py b/transcriber.py
index 5a54e98..425f183 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -1,3 +1,4 @@
+import json
 import logging
 import tempfile
 
@@ -7,15 +8,11 @@
 from app.transcript import Transcript
 from app.transcription import Transcription
 from app.logging import configure_logger, get_logger
+from app.utils import check_if_valid_file_path, write_to_json
 
 logger = get_logger()
 
 
-@click.group()
-def cli():
-    pass
-
-
 def print_version(ctx, param, value):
     if not value or ctx.resilient_parsing:
         return
@@ -23,6 +20,20 @@ def print_version(ctx, param, value):
     ctx.exit()
 
 
+@click.option(
+    "-v",
+    "--version",
+    is_flag=True,
+    callback=print_version,
+    expose_value=False,
+    is_eager=True,
+    help="Show the application's version and exit.",
+)
+@click.group()
+def cli():
+    pass
+
+
 def print_help(ctx, param, value):
     if not value or ctx.resilient_parsing:
         return
@@ -130,7 +141,7 @@ def print_help(ctx, param, value):
 
 @cli.command()
 @click.argument("source", nargs=1)
-@click.argument("loc", nargs=1) # location in the bitcointranscripts hierarchy
+@click.argument("loc", nargs=1)  # location in the bitcointranscripts hierarchy
 # Available transcription models and services
 @whisper
 @deepgram
@@ -176,16 +187,7 @@ def print_help(ctx, param, value):
 @model_output_dir
 @nocleanup
 @verbose_logging
-@click.option(
-    "-v",
-    "--version",
-    is_flag=True,
-    callback=print_version,
-    expose_value=False,
-    is_eager=True,
-    help="Show the application's version and exit.",
-)
-def add(
+def transcribe(
     source: str,
     loc: str,
     model: str,
@@ -245,3 +247,133 @@
     except Exception as e:
         logger.error(e)
         logger.info(f"Exited with error, not cleaning up temp files: {tmp_dir}")
+
+
+@cli.command()
+@click.argument("json_file", nargs=1)
+@whisper
+@deepgram
+@diarize
+@summarize
+@use_youtube_chapters
+@open_pr
+@upload_to_s3
+@save_to_markdown
+@noqueue
+@model_output_dir
+@nocleanup
+@verbose_logging
+def transcribe_from_json(
+    json_file: str,
+    model: str,
+    chapters: bool,
+    deepgram: bool,
+    diarize: bool,
+    summarize: bool,
+    pr: bool,
+    upload: bool,
+    markdown: bool,
+    noqueue: bool,
+    model_output_dir: str,
+    nocleanup: bool,
+    verbose: bool,
+):
+    """Supply sources in a JSON file for transcription.
+    The JSON can be generated by `preprocess-sources` or created manually.
+    """
+    try:
+        check_if_valid_file_path(json_file)
+        tmp_dir = tempfile.mkdtemp()
+        configure_logger(logging.DEBUG if verbose else logging.INFO, tmp_dir)
+        logger.info(f"Adding transcripts from {json_file}")
+        transcription = Transcription(
+            model=model,
+            deepgram=deepgram,
+            chapters=chapters,
+            diarize=diarize,
+            summarize=summarize,
+            upload=upload,
+            markdown=markdown,
+            queue=not noqueue,
+            model_output_dir=model_output_dir,
+            nocleanup=nocleanup,
+            working_dir=tmp_dir
+        )
+
+        with open(json_file, 'r') as file:
+            sources = json.load(file)
+
+        for source in sources:
+            # Configure metadata given from JSON
+            speakers = source.get("speakers", [])
+            category = source.get("categories", [])
+            tags = source.get("tags", [])
+            loc = source.get("loc", "")
+            youtube_metadata = source.get("youtube", None)
+            transcription.add_transcription_source(
+                source_file=source["source_file"], loc=loc,
+                title=source["title"], category=category, tags=tags,
+                speakers=speakers, date=source["date"],
+                youtube_metadata=youtube_metadata,
+                chapters=source["chapters"], link=source["media"]
+            )
+
+        transcription.start()
+        if nocleanup:
+            logger.info("Not cleaning up temp files...")
+        else:
+            transcription.clean_up()
+
+    except Exception as e:
+        logger.error(e)
+
+
+@cli.command()
+@click.argument("json_file", nargs=1)
+@click.option(
+    "--nocheck",
+    is_flag=True,
+    default=False,
+    help="Do not check for existing sources using btctranscripts.com/status.json",
+)
+def preprocess_sources(json_file, nocheck):
+    """Supply sources in a JSON file for preprocess. Preprocessing will fetch
+    all the given sources, and output them in a JSON alongside the available
+    metadata. The JSON can then be edited and piped to `transcribe-from-json`
+    """
+    try:
+        configure_logger(log_level=logging.INFO)
+        check_if_valid_file_path(json_file)
+        transcription = Transcription()
+        with open(json_file, "r") as outfile:
+            sources = json.load(outfile)
+            outfile.close()
+        logger.info(f"Sources detected: {len(sources)}")
+        transcription_sources = []
+        for source in sources:
+            logger.info(f"Preprocessing {source['title']}: {source['source']}")
+            # Configure metadata given from source
+            speakers = source.get("speakers", [])
+            category = source.get("categories", [])
+            tags = source.get("tags", [])
+            loc = source.get("loc", "")
+            excluded_media = source.get(
+                "existing_entries_not_covered_by_btctranscripts/status.json", [])
+            excluded_media = [entry["media"] for entry in excluded_media]
+            transcription_source = transcription.add_transcription_source(
+                source['source'], loc=loc, tags=tags, category=category,
+                speakers=speakers, nocheck=nocheck, preprocess=True,
+                excluded_media=excluded_media
+            )
+            for transcription_source in transcription_source["added"]:
+                transcription_sources.append(transcription_source)
+        # Write all preprocessed sources to JSON
+        write_to_json([source.to_json() for source in transcription_sources],
+                      transcription.model_output_dir, "preprocessed_sources")
+    except Exception as e:
+        logger.error(e)
+        logger.info(f"Exited with error")
+
+
+if __name__ == '__main__':
+    cli()
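For reference, a minimal sketch of the kind of input file `preprocess-sources` appears to expect, inferred from the keys the command reads: `source` and `title` are indexed directly, so they look required, while `loc`, `speakers`, `categories`, `tags`, and the `existing_entries_not_covered_by_btctranscripts/status.json` list fall back to defaults when absent. Every value below is a placeholder, not a real source:

```json
[
  {
    "title": "Example Playlist",
    "source": "https://www.youtube.com/playlist?list=EXAMPLE",
    "loc": "bitcointranscripts/stephan-livera-podcast",
    "categories": ["podcast"],
    "tags": ["example-tag"],
    "speakers": ["Jane Doe"],
    "existing_entries_not_covered_by_btctranscripts/status.json": [
      { "media": "https://www.youtube.com/watch?v=EXAMPLE" }
    ]
  }
]
```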
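And a hedged sketch of one entry in the JSON that `preprocess-sources` writes and `transcribe-from-json` consumes, mirroring the fields emitted by `Audio.to_json()`; `transcribe_from_json` indexes `source_file`, `title`, `date`, `chapters`, and `media` directly, so those look required, while `loc`, `speakers`, `categories`, `tags`, and `youtube` are optional. Again, the values are illustrative only:

```json
[
  {
    "type": "audio",
    "loc": "bitcointranscripts/stephan-livera-podcast",
    "source_file": "https://example.com/episode.mp3",
    "media": "https://example.com/episode.mp3",
    "title": "Example Episode",
    "categories": ["podcast"],
    "tags": ["example-tag"],
    "speakers": ["Jane Doe"],
    "date": "2023-02-01",
    "description": "Example episode description",
    "chapters": []
  }
]
```

Assuming click's default command naming (underscores become dashes, matching the `preprocess-sources` and `transcribe-from-json` names used in the docstrings), the round trip would look roughly like `tstbtc preprocess-sources sources.json`, editing the generated JSON, then running `tstbtc transcribe-from-json` on it.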