diff --git a/Readme.md b/Readme.md index 28c422c..e0ff845 100644 --- a/Readme.md +++ b/Readme.md @@ -25,7 +25,7 @@ This transcription tool operates through a structured four-stage process: - Process diarization. [deepgram only] - Process chapters. 4. Postprocess: Offers multiple options for further actions: - - **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript. + - **Push to GitHub**: Push transcripts to your fork of the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo. - **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts. - **Upload**: Saves transcripts in an AWS S3 Bucket. - **Push to Queuer backend**: Sends transcripts to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend). @@ -37,6 +37,9 @@ This transcription tool operates through a structured four-stage process: - To use [deepgram](https://deepgram.com/) as a transcription service, you must have a valid `DEEPGRAM_API_KEY` in the `.env` file. +- To push the resulting transcript to GitHub you need to clone your fork + and define the `BITCOINTRANSCRIPTS_DIR` in the `.env` file. + - To push the resulting transcript to a Queuer backend, you must have a valid `QUEUE_ENDPOINT` in the `.env` file. If not, you can instead save the payload in a json file using the `--noqueue` flag. @@ -120,7 +123,7 @@ To configure the transcription process, you can use the following flags: - `-M` or `--diarize`: Supply this flag if you have multiple speakers AKA want to diarize the content [only available with deepgram] - `-S` or `--summarize`: Summarize the transcript [only available with deepgram] - `-C` or `--chapters`: For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript. -- `-p` or `--pr`: Open a PR on the bitcointranscripts repo +- `--github`: Push transcripts to a new branch on the origin bitcointranscripts repo - `-u` or `--upload`: Upload processed model files to AWS S3 - `--markdown`: Save the resulting transcript to a markdown format supported by bitcointranscripts - `--noqueue`: Do not push the resulting transcript to the Queuer, instead store the payload in a json file diff --git a/app/transcription.py b/app/transcription.py index 354de20..0f0fc3d 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -1,7 +1,10 @@ import json import logging import os +import shutil +import random import re +import subprocess import tempfile import time from datetime import datetime @@ -22,13 +25,14 @@ ) from app.logging import get_logger from app.queuer import Queuer +from app.types import PostprocessOutput class Transcription: def __init__( self, model="tiny", - pr=False, + github=False, summarize=False, deepgram=False, diarize=False, @@ -50,8 +54,8 @@ def __init__( self.transcript_by = "username" if test_mode else self.__get_username() # during testing we need to create the markdown for validation purposes self.markdown = markdown or test_mode + self.bitcointranscripts_dir = self.__configure_target_repo(github) self.review_flag = self.__configure_review_flag(needs_review) - self.open_pr = pr if deepgram: self.service = services.Deepgram( summarize, diarize, upload, model_output_dir) @@ -73,13 +77,24 @@ def _create_subdirectory(self, subdir_name): os.makedirs(subdir_path) return subdir_path + def __configure_target_repo(self, github): + if not github: + return None + config = dotenv_values(".env") + git_repo_dir = config.get("BITCOINTRANSCRIPTS_DIR") + if not git_repo_dir: + raise Exception( + "To push to GitHub you need to define a 'BITCOINTRANSCRIPTS_DIR' in your .env file") + return None + return git_repo_dir + def __configure_review_flag(self, needs_review): # sanity check if needs_review and not self.markdown: raise Exception( "The `--needs-review` flag is only applicable when creating a markdown") - if needs_review: + if needs_review or self.bitcointranscripts_dir: return " --needs-review" else: return "" @@ -282,10 +297,43 @@ def start(self, test_transcript=None): postprocessed_transcript = self.postprocess(transcript) self.result.append(postprocessed_transcript) + if self.bitcointranscripts_dir: + self.push_to_github(self.result) return self.result except Exception as e: raise Exception(f"Error with the transcription: {e}") from e + def push_to_github(self, outputs: list[PostprocessOutput]): + # Change to the directory where your Git repository is located + os.chdir(self.bitcointranscripts_dir) + # Fetch the latest changes from the remote repository + subprocess.run(['git', 'fetch', 'origin', 'master']) + # Create a new branch from the fetched 'origin/master' + branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}" + subprocess.run(['git', 'checkout', '-b', branch_name, 'origin/master']) + # For each output with markdown, create a new commit in the new branch + for output in outputs: + if output.get('markdown'): + markdown_file = output['markdown'] + destination_path = os.path.join( + self.bitcointranscripts_dir, output["transcript"].source.loc) + # Ensure the markdown file exists before copying + if os.path.exists(markdown_file): + shutil.copy(markdown_file, destination_path) + markdown_file_name = os.path.basename(markdown_file) + subprocess.run(['git', 'add', os.path.join( + destination_path, markdown_file_name)]) + subprocess.run( + ['git', 'commit', '-m', f'Add "{output["transcript"].title}" to {output["transcript"].source.loc}']) + else: + print(f"Markdown file {markdown_file} does not exist.") + + # Push the branch to the remote repository + subprocess.run(['git', 'push', 'origin', branch_name]) + # Delete branch locally + subprocess.run(['git', 'checkout', 'master']) + subprocess.run(['git', 'branch', '-D', branch_name]) + def write_to_markdown_file(self, transcript: Transcript, output_dir): """Writes transcript to a markdown file and returns its absolute path This file is the one submitted as part of the Pull Request to the @@ -332,23 +380,15 @@ def write_to_json_file(self, transcript: Transcript): self.logger.info(f"Transcription stored at {json_file}") return json_file - def postprocess(self, transcript: Transcript): + def postprocess(self, transcript: Transcript) -> PostprocessOutput: try: - result = transcript.result + result = {} + result["transcript"] = transcript output_dir = f"{self.model_output_dir}/{transcript.source.loc}" - if self.markdown: - transcription_md_file = self.write_to_markdown_file( + if self.markdown or self.bitcointranscripts_dir: + result["markdown"] = self.write_to_markdown_file( transcript, output_dir if not self.test_mode else transcript.tmp_dir) - result = transcription_md_file - if self.open_pr: - application.create_pr( - absolute_path=transcription_md_file, - loc=transcript.source.source_file, - username=self.transcript_by, - curr_time=str(round(time.time() * 1000)), - title=transcript.title, - ) elif not self.test_mode: transcript_json = transcript.to_json() transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}" @@ -356,8 +396,7 @@ def postprocess(self, transcript: Transcript): return self.queuer.push_to_queue(transcript_json) else: # store payload for the user to manually send it to the queuer - payload_json_file = self.write_to_json_file(transcript) - result = payload_json_file + result["json"] = self.write_to_json_file(transcript) return result except Exception as e: raise Exception(f"Error with postprocessing: {e}") from e diff --git a/app/types.py b/app/types.py new file mode 100644 index 0000000..946cd99 --- /dev/null +++ b/app/types.py @@ -0,0 +1,12 @@ +from typing import ( + TypedDict, + Optional +) + +from app.transcript import Transcript + + +class PostprocessOutput(TypedDict): + transcript: Transcript + markdown: Optional[str] + json: Optional[str] diff --git a/test/test_audio.py b/test/test_audio.py index b941d1e..8f4d689 100644 --- a/test/test_audio.py +++ b/test/test_audio.py @@ -26,9 +26,9 @@ def test_audio_with_title(): ) transcription.add_transcription_source(source_file=source, title=title) transcripts = transcription.start() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, @@ -57,9 +57,9 @@ def test_audio_with_all_data(): source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers) transcripts = transcription.start() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, diff --git a/test/test_video.py b/test/test_video.py index 2751c10..750dc90 100644 --- a/test/test_video.py +++ b/test/test_video.py @@ -33,9 +33,9 @@ def test_video_with_title(): source, title, date, tags, category, speakers) transcripts = transcription.start() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, @@ -65,10 +65,10 @@ def test_video_with_all_options(): transcription.add_transcription_source( source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers) transcripts = transcription.start() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, @@ -110,9 +110,9 @@ def test_video_with_chapters(): chapter_names.append(x.split("= ")[1].strip()) file.close() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, diff --git a/transcriber.py b/transcriber.py index 8f51a87..3f61b23 100644 --- a/transcriber.py +++ b/transcriber.py @@ -88,12 +88,11 @@ def print_help(ctx, param, value): default=False, help="Summarize the transcript [only available with deepgram]", ) -open_pr = click.option( - "-p", - "--PR", +github = click.option( + "--github", is_flag=True, default=False, - help="Open a PR on the bitcointranscripts repo", + help="Push transcripts to a new branch on the origin bitcointranscripts repo", ) upload_to_s3 = click.option( "-u", @@ -195,7 +194,7 @@ def print_help(ctx, param, value): @add_category @add_loc # Options for configuring the transcription postprocess -@open_pr +@github @upload_to_s3 @save_to_markdown @noqueue @@ -213,7 +212,7 @@ def transcribe( tags: list, speakers: list, category: list, - pr: bool, + github: bool, deepgram: bool, summarize: bool, diarize: bool, @@ -245,7 +244,7 @@ def transcribe( try: transcription = Transcription( model=model, - pr=pr, + github=github, summarize=summarize, deepgram=deepgram, diarize=diarize, @@ -353,7 +352,7 @@ def preprocess( ) @click.argument("metadata_json_file", nargs=1) # Options for configuring the transcription postprocess -@open_pr +@github @upload_to_s3 @save_to_markdown @noqueue @@ -361,7 +360,7 @@ def preprocess( def postprocess( metadata_json_file, service, - pr: bool, + github: bool, upload: bool, markdown: bool, noqueue: bool, @@ -376,7 +375,7 @@ def postprocess( utils.check_if_valid_file_path(metadata_json_file) transcription = Transcription( deepgram=service == "deepgram", - pr=pr, + github=github, upload=upload, markdown=markdown, queue=not noqueue, @@ -406,7 +405,10 @@ def postprocess( f"{service}_output"] transcript_to_postprocess.result = transcription.service.finalize_transcript( transcript_to_postprocess) - transcription.postprocess(transcript_to_postprocess) + postprocessed_transcript = transcription.postprocess(transcript_to_postprocess) + + if transcription.bitcointranscripts_dir: + transcription.push_to_github([postprocessed_transcript]) except Exception as e: logger.error(e) traceback.print_exc()