From a0a4d9f49e37c2a6e851966be9ace7330c618f59 Mon Sep 17 00:00:00 2001 From: kouloumos Date: Thu, 14 Dec 2023 11:38:10 +0200 Subject: [PATCH] add push-to-github functionality replace the unused, complex and `gh`-dependent functionality of directly opening a PR against master with a simpler opt-in flag that pushes a new branch with a separate commit for each resulting transcript. The user can then open a PR against master manually. --- Readme.md | 25 ++++----------- app/application.py | 21 ------------- app/transcription.py | 75 +++++++++++++++++++++++++++++++++----------- app/types.py | 12 +++++++ github.sh | 15 --------- initializeRepo.sh | 64 ------------------------------------- test/test_audio.py | 8 ++--- test/test_cli.py | 9 ------ test/test_video.py | 12 +++---- transcriber.py | 24 +++++++------- 10 files changed, 98 insertions(+), 167 deletions(-) create mode 100644 app/types.py delete mode 100644 github.sh delete mode 100644 initializeRepo.sh diff --git a/Readme.md b/Readme.md index 28c422c..ba40e7c 100644 --- a/Readme.md +++ b/Readme.md @@ -25,7 +25,7 @@ This transcription tool operates through a structured four-stage process: - Process diarization. [deepgram only] - Process chapters. 4. Postprocess: Offers multiple options for further actions: - - **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript. + - **Push to GitHub**: Push transcripts to your fork of the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo. - **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts. - **Upload**: Saves transcripts in an AWS S3 Bucket. - **Push to Queuer backend**: Sends transcripts to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend). 
@@ -37,27 +37,14 @@ This transcription tool operates through a structured four-stage process: - To use [deepgram](https://deepgram.com/) as a transcription service, you must have a valid `DEEPGRAM_API_KEY` in the `.env` file. +- To push the resulting transcript to GitHub you need to fork + [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) + and then clone your fork and define your `BITCOINTRANSCRIPTS_DIR` in the `.env` file. + - To push the resulting transcript to a Queuer backend, you must have a valid `QUEUE_ENDPOINT` in the `.env` file. If not, you can instead save the payload in a json file using the `--noqueue` flag. -- To enable us fork bitcointranscript repo and open a PR, we require you to - login into your GitHub account. Kindly install `GITHUB CLI` using the - instructions on their repo [here](https://github.com/cli/cli#installation). - Following the prompt, please select the below options from the prompt to - login: - - - what account do you want to log into? `Github.com` - - - what is your preferred protocol for Git operations? `SSH` - - - Upload your SSH public key to your GitHub account? `skip` - - - How would you like to authenticate GitHub CLI? `Login with a web browser` - - - copy the generated one-time pass-code and paste in the browser to - authenticate if you have enabled 2FA - - To enable pushing the models to a S3 bucket, - [Install](https://aws.amazon.com/cli/) aws-cli to your system. - [Configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) @@ -120,7 +107,7 @@ To configure the transcription process, you can use the following flags: - `-M` or `--diarize`: Supply this flag if you have multiple speakers AKA want to diarize the content [only available with deepgram] - `-S` or `--summarize`: Summarize the transcript [only available with deepgram] - `-C` or `--chapters`: For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript. 
-- `-p` or `--pr`: Open a PR on the bitcointranscripts repo +- `--github`: Push transcripts to a new branch on the origin bitcointranscripts repo - `-u` or `--upload`: Upload processed model files to AWS S3 - `--markdown`: Save the resulting transcript to a markdown format supported by bitcointranscripts - `--noqueue`: Do not push the resulting transcript to the Queuer, instead store the payload in a json file diff --git a/app/application.py b/app/application.py index a8fe2b3..6f23940 100644 --- a/app/application.py +++ b/app/application.py @@ -28,26 +28,6 @@ def convert_wav_to_mp3(abs_path, filename, working_dir="tmp/"): return os.path.abspath(os.path.join(working_dir, filename[:-4] + ".mp3")) -def create_pr(absolute_path, loc, username, curr_time, title): - logger = logging.getLogger(__app_name__) - branch_name = loc.replace("/", "-") - subprocess.call( - [ - "bash", - "initializeRepo.sh", - absolute_path, - loc, - branch_name, - username, - curr_time, - ] - ) - subprocess.call( - ["bash", "github.sh", branch_name, username, curr_time, title] - ) - logger.info("Please check the PR for the transcription.") - - def clean_up(tmp_dir): try: shutil.rmtree(tmp_dir) @@ -56,7 +36,6 @@ def clean_up(tmp_dir): raise - def upload_file_to_s3(file_path): logger = logging.getLogger(__app_name__) s3 = boto3.client("s3") diff --git a/app/transcription.py b/app/transcription.py index 354de20..0f0fc3d 100644 --- a/app/transcription.py +++ b/app/transcription.py @@ -1,7 +1,10 @@ import json import logging import os +import shutil +import random import re +import subprocess import tempfile import time from datetime import datetime @@ -22,13 +25,14 @@ ) from app.logging import get_logger from app.queuer import Queuer +from app.types import PostprocessOutput class Transcription: def __init__( self, model="tiny", - pr=False, + github=False, summarize=False, deepgram=False, diarize=False, @@ -50,8 +54,8 @@ def __init__( self.transcript_by = "username" if test_mode else 
self.__get_username() # during testing we need to create the markdown for validation purposes self.markdown = markdown or test_mode + self.bitcointranscripts_dir = self.__configure_target_repo(github) self.review_flag = self.__configure_review_flag(needs_review) - self.open_pr = pr if deepgram: self.service = services.Deepgram( summarize, diarize, upload, model_output_dir) @@ -73,13 +77,24 @@ def _create_subdirectory(self, subdir_name): os.makedirs(subdir_path) return subdir_path + def __configure_target_repo(self, github): + if not github: + return None + config = dotenv_values(".env") + git_repo_dir = config.get("BITCOINTRANSCRIPTS_DIR") + if not git_repo_dir: + raise Exception( + "To push to GitHub you need to define a 'BITCOINTRANSCRIPTS_DIR' in your .env file") + return None + return git_repo_dir + def __configure_review_flag(self, needs_review): # sanity check if needs_review and not self.markdown: raise Exception( "The `--needs-review` flag is only applicable when creating a markdown") - if needs_review: + if needs_review or self.bitcointranscripts_dir: return " --needs-review" else: return "" @@ -282,10 +297,43 @@ def start(self, test_transcript=None): postprocessed_transcript = self.postprocess(transcript) self.result.append(postprocessed_transcript) + if self.bitcointranscripts_dir: + self.push_to_github(self.result) return self.result except Exception as e: raise Exception(f"Error with the transcription: {e}") from e + def push_to_github(self, outputs: list[PostprocessOutput]): + # Change to the directory where your Git repository is located + os.chdir(self.bitcointranscripts_dir) + # Fetch the latest changes from the remote repository + subprocess.run(['git', 'fetch', 'origin', 'master']) + # Create a new branch from the fetched 'origin/master' + branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}" + subprocess.run(['git', 'checkout', '-b', branch_name, 'origin/master']) + # For each output with markdown, create a new 
commit in the new branch + for output in outputs: + if output.get('markdown'): + markdown_file = output['markdown'] + destination_path = os.path.join( + self.bitcointranscripts_dir, output["transcript"].source.loc) + # Ensure the markdown file exists before copying + if os.path.exists(markdown_file): + shutil.copy(markdown_file, destination_path) + markdown_file_name = os.path.basename(markdown_file) + subprocess.run(['git', 'add', os.path.join( + destination_path, markdown_file_name)]) + subprocess.run( + ['git', 'commit', '-m', f'Add "{output["transcript"].title}" to {output["transcript"].source.loc}']) + else: + print(f"Markdown file {markdown_file} does not exist.") + + # Push the branch to the remote repository + subprocess.run(['git', 'push', 'origin', branch_name]) + # Delete branch locally + subprocess.run(['git', 'checkout', 'master']) + subprocess.run(['git', 'branch', '-D', branch_name]) + def write_to_markdown_file(self, transcript: Transcript, output_dir): """Writes transcript to a markdown file and returns its absolute path This file is the one submitted as part of the Pull Request to the @@ -332,23 +380,15 @@ def write_to_json_file(self, transcript: Transcript): self.logger.info(f"Transcription stored at {json_file}") return json_file - def postprocess(self, transcript: Transcript): + def postprocess(self, transcript: Transcript) -> PostprocessOutput: try: - result = transcript.result + result = {} + result["transcript"] = transcript output_dir = f"{self.model_output_dir}/{transcript.source.loc}" - if self.markdown: - transcription_md_file = self.write_to_markdown_file( + if self.markdown or self.bitcointranscripts_dir: + result["markdown"] = self.write_to_markdown_file( transcript, output_dir if not self.test_mode else transcript.tmp_dir) - result = transcription_md_file - if self.open_pr: - application.create_pr( - absolute_path=transcription_md_file, - loc=transcript.source.source_file, - username=self.transcript_by, - 
curr_time=str(round(time.time() * 1000)), - title=transcript.title, - ) elif not self.test_mode: transcript_json = transcript.to_json() transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}" @@ -356,8 +396,7 @@ def postprocess(self, transcript: Transcript): return self.queuer.push_to_queue(transcript_json) else: # store payload for the user to manually send it to the queuer - payload_json_file = self.write_to_json_file(transcript) - result = payload_json_file + result["json"] = self.write_to_json_file(transcript) return result except Exception as e: raise Exception(f"Error with postprocessing: {e}") from e diff --git a/app/types.py b/app/types.py new file mode 100644 index 0000000..946cd99 --- /dev/null +++ b/app/types.py @@ -0,0 +1,12 @@ +from typing import ( + TypedDict, + Optional +) + +from app.transcript import Transcript + + +class PostprocessOutput(TypedDict): + transcript: Transcript + markdown: Optional[str] + json: Optional[str] diff --git a/github.sh b/github.sh deleted file mode 100644 index d7ba73e..0000000 --- a/github.sh +++ /dev/null @@ -1,15 +0,0 @@ -cd bitcointranscripts || exit -git checkout "${3}-${1}" -# add the transcript to the repo and commit -git add . 
&& git commit -m "added ${4}" -git push origin "${3}-${1}" - -echo "Setting default repo to ${2}/bitcointranscripts" -# set the default repo to the fork -gh repo set-default "${2}"/bitcointranscripts - -# push the changes to the fork and create a pull request -gh pr create --base master --title "Autogenerated ${4} by tstbtc" --body "transcribed to bitcoin transcript for ${4}" --head "${2}:${3}-${1}" - -echo "Done" - diff --git a/initializeRepo.sh b/initializeRepo.sh deleted file mode 100644 index e5bf22b..0000000 --- a/initializeRepo.sh +++ /dev/null @@ -1,64 +0,0 @@ -# Make executable with chmod +x <> - -# check if github is logged in -if gh auth status; then - echo 'already logged into github' -else - gh auth login -fi - #check if the repo exists -if [ -d "./bitcointranscripts/" ]; then - # set the repo to the current directory - git pull upstream master - cd bitcointranscripts || exit -else - # fork and clone the repo - gh repo fork bitcointranscripts/bitcointranscripts --clone - gh repo set-default "${4}"/bitcointranscripts - - # set the repo to the current directory - cd bitcointranscripts || exit -fi -# check if the current branch is master else checkout master -git_branch="$(git rev-parse --abbrev-ref HEAD)" -if [ "${git_branch}" != "master" ]; then - git checkout master -fi - -gh repo sync --branch master - -# create a new branch or checkout the branch if it exists -if [ "$(git show-ref --quiet refs/heads/${5}-${3})" ]; then - git checkout "${5}-${3}" -else - git checkout -b "${5}-${3}" -fi - -echo "switched to branch ${5}-${3}" - -# check if the loc exists or not -if [ ! -d "./${2}" ]; then - mkdir -p "${2}" -fi - -temp=${PWD} - -#discover the directories -IFS=/ read -ra dirs <<< "${2}" - -for item in "${dirs[@]}" -do - cd "${item}" || return #tvpeter - - # check if the index file exists - if [ ! 
-f ./_index.md ]; then - echo -e "---\ntitle: ${item}\n---\n\n{{< childpages >}}" >> _index.md - fi - -done - -# goto the original directory -cd "${temp}" || return - -# move the transcript to the directory -mv "${1}" "./${2}" diff --git a/test/test_audio.py b/test/test_audio.py index b941d1e..8f4d689 100644 --- a/test/test_audio.py +++ b/test/test_audio.py @@ -26,9 +26,9 @@ def test_audio_with_title(): ) transcription.add_transcription_source(source_file=source, title=title) transcripts = transcription.start() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, @@ -57,9 +57,9 @@ def test_audio_with_all_data(): source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers) transcripts = transcription.start() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, diff --git a/test/test_cli.py b/test/test_cli.py index 41f85c1..1ab1484 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -13,15 +13,6 @@ def rel_path(path): ) -@pytest.mark.main -def test_initialize_repo(): - try: - shutil.rmtree("bitcointranscripts", ignore_errors=True) - except OSError as e: - print(f"Error occurred while removing directory: {e}") - assert False - - # @pytest.mark.main # def test_find_source_type(): # @TODO rewwrite diff --git a/test/test_video.py b/test/test_video.py index 2751c10..750dc90 100644 --- a/test/test_video.py +++ b/test/test_video.py @@ -33,9 +33,9 @@ def test_video_with_title(): source, title, date, tags, category, speakers) transcripts = transcription.start() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + 
path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, @@ -65,10 +65,10 @@ def test_video_with_all_options(): transcription.add_transcription_source( source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers) transcripts = transcription.start() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, @@ -110,9 +110,9 @@ def test_video_with_chapters(): chapter_names.append(x.split("= ")[1].strip()) file.close() - assert os.path.isfile(transcripts[0]) + assert os.path.isfile(transcripts[0]["markdown"]) check_md_file( - path=transcripts[0], + path=transcripts[0]["markdown"], transcript_by=username, media=source, title=title, diff --git a/transcriber.py b/transcriber.py index 8f51a87..3f61b23 100644 --- a/transcriber.py +++ b/transcriber.py @@ -88,12 +88,11 @@ def print_help(ctx, param, value): default=False, help="Summarize the transcript [only available with deepgram]", ) -open_pr = click.option( - "-p", - "--PR", +github = click.option( + "--github", is_flag=True, default=False, - help="Open a PR on the bitcointranscripts repo", + help="Push transcripts to a new branch on the origin bitcointranscripts repo", ) upload_to_s3 = click.option( "-u", @@ -195,7 +194,7 @@ def print_help(ctx, param, value): @add_category @add_loc # Options for configuring the transcription postprocess -@open_pr +@github @upload_to_s3 @save_to_markdown @noqueue @@ -213,7 +212,7 @@ def transcribe( tags: list, speakers: list, category: list, - pr: bool, + github: bool, deepgram: bool, summarize: bool, diarize: bool, @@ -245,7 +244,7 @@ def transcribe( try: transcription = Transcription( model=model, - pr=pr, + github=github, summarize=summarize, deepgram=deepgram, diarize=diarize, @@ -353,7 +352,7 @@ def preprocess( ) @click.argument("metadata_json_file", nargs=1) # Options 
for configuring the transcription postprocess -@open_pr +@github @upload_to_s3 @save_to_markdown @noqueue @@ -361,7 +360,7 @@ def preprocess( def postprocess( metadata_json_file, service, - pr: bool, + github: bool, upload: bool, markdown: bool, noqueue: bool, @@ -376,7 +375,7 @@ def postprocess( utils.check_if_valid_file_path(metadata_json_file) transcription = Transcription( deepgram=service == "deepgram", - pr=pr, + github=github, upload=upload, markdown=markdown, queue=not noqueue, @@ -406,7 +405,10 @@ def postprocess( f"{service}_output"] transcript_to_postprocess.result = transcription.service.finalize_transcript( transcript_to_postprocess) - transcription.postprocess(transcript_to_postprocess) + postprocessed_transcript = transcription.postprocess(transcript_to_postprocess) + + if transcription.bitcointranscripts_dir: + transcription.push_to_github([postprocessed_transcript]) except Exception as e: logger.error(e) traceback.print_exc()