From a0a4d9f49e37c2a6e851966be9ace7330c618f59 Mon Sep 17 00:00:00 2001
From: kouloumos <kouloumosa@gmail.com>
Date: Thu, 14 Dec 2023 11:38:10 +0200
Subject: [PATCH] add push-to-github functionality

Replace the unused, complex and `gh`-dependent functionality of directly
opening a PR against master with a simpler opt-in flag that pushes a
new branch with a separate commit for each resulting transcript.
The user can then open a PR against master manually.
---
 Readme.md            | 25 ++++-----------
 app/application.py   | 21 -------------
 app/transcription.py | 75 +++++++++++++++++++++++++++++++++-----------
 app/types.py         | 12 +++++++
 github.sh            | 15 ---------
 initializeRepo.sh    | 64 -------------------------------------
 test/test_audio.py   |  8 ++---
 test/test_cli.py     |  9 ------
 test/test_video.py   | 12 +++----
 transcriber.py       | 24 +++++++-------
 10 files changed, 98 insertions(+), 167 deletions(-)
 create mode 100644 app/types.py
 delete mode 100644 github.sh
 delete mode 100644 initializeRepo.sh

diff --git a/Readme.md b/Readme.md
index 28c422c..ba40e7c 100644
--- a/Readme.md
+++ b/Readme.md
@@ -25,7 +25,7 @@ This transcription tool operates through a structured four-stage process:
         - Process diarization. [deepgram only]
         - Process chapters.
 4. Postprocess: Offers multiple options for further actions:
-    - **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript.
+    - **Push to GitHub**: Push transcripts to your fork of the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo.
     - **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts.
     - **Upload**: Saves transcripts in an AWS S3 Bucket.
     - **Push to Queuer backend**: Sends transcripts to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend).
@@ -37,27 +37,14 @@ This transcription tool operates through a structured four-stage process:
 - To use [deepgram](https://deepgram.com/) as a transcription service,
   you must have a valid `DEEPGRAM_API_KEY` in the `.env` file.
 
+- To push the resulting transcript to GitHub, fork
+  [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts),
+  clone your fork, and set `BITCOINTRANSCRIPTS_DIR` in the `.env` file to the path of that clone.
+
 - To push the resulting transcript to a Queuer backend, you must have a 
   valid `QUEUE_ENDPOINT` in the `.env` file. If not, you can instead save
   the payload in a json file using the `--noqueue` flag.
 
-- To enable us fork bitcointranscript repo and open a PR, we require you to
-  login into your GitHub account. Kindly install `GITHUB CLI` using the
-  instructions on their repo [here](https://github.com/cli/cli#installation).
-  Following the prompt, please select the below options from the prompt to
-  login:
-
-    - what account do you want to log into? `Github.com`
-
-    - what is your preferred protocol for Git operations? `SSH`
-
-    - Upload your SSH public key to your GitHub account? `skip`
-
-    - How would you like to authenticate GitHub CLI? `Login with a web browser`
-
-    - copy the generated one-time pass-code and paste in the browser to
-      authenticate if you have enabled 2FA
-
 - To enable pushing the models to a S3 bucket,
     - [Install](https://aws.amazon.com/cli/) aws-cli to your system.
     - [Configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
@@ -120,7 +107,7 @@ To configure the transcription process, you can use the following flags:
 - `-M` or `--diarize`: Supply this flag if you have multiple speakers AKA want to diarize the content [only available with deepgram]
 - `-S` or `--summarize`: Summarize the transcript [only available with deepgram]
 - `-C` or `--chapters`: For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript.
-- `-p` or `--pr`: Open a PR on the bitcointranscripts repo
+- `--github`: Push transcripts to a new branch on the origin bitcointranscripts repo
 - `-u` or `--upload`: Upload processed model files to AWS S3
 - `--markdown`: Save the resulting transcript to a markdown format supported by bitcointranscripts
 - `--noqueue`: Do not push the resulting transcript to the Queuer, instead store the payload in a json file
diff --git a/app/application.py b/app/application.py
index a8fe2b3..6f23940 100644
--- a/app/application.py
+++ b/app/application.py
@@ -28,26 +28,6 @@ def convert_wav_to_mp3(abs_path, filename, working_dir="tmp/"):
     return os.path.abspath(os.path.join(working_dir, filename[:-4] + ".mp3"))
 
 
-def create_pr(absolute_path, loc, username, curr_time, title):
-    logger = logging.getLogger(__app_name__)
-    branch_name = loc.replace("/", "-")
-    subprocess.call(
-        [
-            "bash",
-            "initializeRepo.sh",
-            absolute_path,
-            loc,
-            branch_name,
-            username,
-            curr_time,
-        ]
-    )
-    subprocess.call(
-        ["bash", "github.sh", branch_name, username, curr_time, title]
-    )
-    logger.info("Please check the PR for the transcription.")
-
-
 def clean_up(tmp_dir):
     try:
         shutil.rmtree(tmp_dir)
@@ -56,7 +36,6 @@ def clean_up(tmp_dir):
             raise
 
 
-
 def upload_file_to_s3(file_path):
     logger = logging.getLogger(__app_name__)
     s3 = boto3.client("s3")
diff --git a/app/transcription.py b/app/transcription.py
index 354de20..0f0fc3d 100644
--- a/app/transcription.py
+++ b/app/transcription.py
@@ -1,7 +1,10 @@
 import json
 import logging
 import os
+import shutil
+import random
 import re
+import subprocess
 import tempfile
 import time
 from datetime import datetime
@@ -22,13 +25,14 @@
 )
 from app.logging import get_logger
 from app.queuer import Queuer
+from app.types import PostprocessOutput
 
 
 class Transcription:
     def __init__(
         self,
         model="tiny",
-        pr=False,
+        github=False,
         summarize=False,
         deepgram=False,
         diarize=False,
@@ -50,8 +54,8 @@ def __init__(
         self.transcript_by = "username" if test_mode else self.__get_username()
         # during testing we need to create the markdown for validation purposes
         self.markdown = markdown or test_mode
+        self.bitcointranscripts_dir = self.__configure_target_repo(github)
         self.review_flag = self.__configure_review_flag(needs_review)
-        self.open_pr = pr
         if deepgram:
             self.service = services.Deepgram(
                 summarize, diarize, upload, model_output_dir)
@@ -73,13 +77,24 @@ def _create_subdirectory(self, subdir_name):
         os.makedirs(subdir_path)
         return subdir_path
 
+    def __configure_target_repo(self, github):
+        """Return `BITCOINTRANSCRIPTS_DIR` from `.env`, or None when `github` is falsy; raise if unset."""
+        if not github:
+            return None
+        config = dotenv_values(".env")
+        git_repo_dir = config.get("BITCOINTRANSCRIPTS_DIR")
+        if not git_repo_dir:
+            raise Exception(
+                "To push to GitHub you need to define a 'BITCOINTRANSCRIPTS_DIR' in your .env file")
+        return git_repo_dir
+
     def __configure_review_flag(self, needs_review):
         # sanity check
         if needs_review and not self.markdown:
             raise Exception(
                 "The `--needs-review` flag is only applicable when creating a markdown")
 
-        if needs_review:
+        if needs_review or self.bitcointranscripts_dir:
             return " --needs-review"
         else:
             return ""
@@ -282,10 +297,43 @@ def start(self, test_transcript=None):
                 postprocessed_transcript = self.postprocess(transcript)
                 self.result.append(postprocessed_transcript)
 
+            if self.bitcointranscripts_dir:
+                self.push_to_github(self.result)
             return self.result
         except Exception as e:
             raise Exception(f"Error with the transcription: {e}") from e
 
+    def push_to_github(self, outputs: list[PostprocessOutput]):
+        """Commit each markdown transcript to a new branch and push it to origin.
+
+        git runs with `cwd` set to the repo (no `os.chdir`). Except for commits,
+        a failing git command raises `subprocess.CalledProcessError`.
+        """
+        repo = os.path.abspath(self.bitcointranscripts_dir)
+        def git(*args, check=True):
+            subprocess.run(['git', *args], cwd=repo, check=check)
+
+        branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}"
+        git('fetch', 'origin', 'master')
+        git('checkout', '-b', branch_name, 'origin/master')
+        for output in outputs:
+            markdown_file = output.get('markdown')
+            if not markdown_file:
+                continue
+            if not os.path.exists(markdown_file):
+                print(f"Markdown file {markdown_file} does not exist.")
+                continue
+            transcript = output["transcript"]
+            destination_dir = os.path.join(repo, transcript.source.loc)
+            # a new `loc` has no directory yet; copying would create a file with its name
+            os.makedirs(destination_dir, exist_ok=True)
+            git('add', shutil.copy(markdown_file, destination_dir))
+            # best-effort: committing an unchanged transcript exits non-zero
+            git('commit', '-m', f'Add "{transcript.title}" to {transcript.source.loc}', check=False)
+        git('push', 'origin', branch_name)
+        git('checkout', 'master')
+        git('branch', '-D', branch_name)
+
     def write_to_markdown_file(self, transcript: Transcript, output_dir):
         """Writes transcript to a markdown file and returns its absolute path
         This file is the one submitted as part of the Pull Request to the
@@ -332,23 +380,15 @@ def write_to_json_file(self, transcript: Transcript):
         self.logger.info(f"Transcription stored at {json_file}")
         return json_file
 
-    def postprocess(self, transcript: Transcript):
+    def postprocess(self, transcript: Transcript) -> PostprocessOutput:
         try:
-            result = transcript.result
+            result = {}
+            result["transcript"] = transcript
             output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
-            if self.markdown:
-                transcription_md_file = self.write_to_markdown_file(
+            if self.markdown or self.bitcointranscripts_dir:
+                result["markdown"] = self.write_to_markdown_file(
                     transcript,
                     output_dir if not self.test_mode else transcript.tmp_dir)
-                result = transcription_md_file
-            if self.open_pr:
-                application.create_pr(
-                    absolute_path=transcription_md_file,
-                    loc=transcript.source.source_file,
-                    username=self.transcript_by,
-                    curr_time=str(round(time.time() * 1000)),
-                    title=transcript.title,
-                )
             elif not self.test_mode:
                 transcript_json = transcript.to_json()
                 transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}"
@@ -356,8 +396,7 @@ def postprocess(self, transcript: Transcript):
                     return self.queuer.push_to_queue(transcript_json)
                 else:
                     # store payload for the user to manually send it to the queuer
-                    payload_json_file = self.write_to_json_file(transcript)
-                    result = payload_json_file
+                    result["json"] = self.write_to_json_file(transcript)
             return result
         except Exception as e:
             raise Exception(f"Error with postprocessing: {e}") from e
diff --git a/app/types.py b/app/types.py
new file mode 100644
index 0000000..946cd99
--- /dev/null
+++ b/app/types.py
@@ -0,0 +1,12 @@
+from typing import (
+    TypedDict,
+    Optional
+)
+
+from app.transcript import Transcript
+
+
+class PostprocessOutput(TypedDict):  # dict built by Transcription.postprocess for one transcript
+    transcript: Transcript  # the transcript that was postprocessed
+    markdown: Optional[str]  # path of the written markdown file; key is absent when none was written
+    json: Optional[str]  # path of the stored payload json file; key is absent when none was written
diff --git a/github.sh b/github.sh
deleted file mode 100644
index d7ba73e..0000000
--- a/github.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-cd bitcointranscripts || exit
-git checkout "${3}-${1}"
-# add the transcript to the repo and commit
-git add . && git commit -m "added ${4}"
-git push origin "${3}-${1}"
-
-echo "Setting default repo to ${2}/bitcointranscripts"
-# set the default repo to the fork
-gh repo set-default "${2}"/bitcointranscripts
-
-# push the changes to the fork and create a pull request
-gh pr create --base master --title "Autogenerated ${4} by tstbtc" --body "transcribed to bitcoin transcript for ${4}" --head "${2}:${3}-${1}"
-
-echo "Done"
-
diff --git a/initializeRepo.sh b/initializeRepo.sh
deleted file mode 100644
index e5bf22b..0000000
--- a/initializeRepo.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-# Make executable with chmod +x <<filename.sh>>
-
-# check if github is logged in
-if gh auth status;  then
-  echo 'already logged into github'
-else
-  gh auth login
-fi
- #check if the repo exists
-if [ -d "./bitcointranscripts/" ]; then
-  # set the repo to the current directory
-  git pull upstream master
-  cd bitcointranscripts || exit
-else
-  # fork and clone the repo
-  gh repo fork bitcointranscripts/bitcointranscripts --clone
-  gh repo set-default "${4}"/bitcointranscripts
-
-  # set the repo to the current directory
-  cd bitcointranscripts || exit
-fi
-# check if the current branch is master else checkout master
-git_branch="$(git rev-parse --abbrev-ref HEAD)"
-if [ "${git_branch}" != "master" ]; then
-  git checkout master
-fi
-
-gh repo sync --branch master
-
-# create a new branch or checkout the branch if it exists
-if [ "$(git show-ref --quiet refs/heads/${5}-${3})" ]; then
-  git checkout "${5}-${3}"
-else
-  git checkout -b "${5}-${3}"
-fi
-
-echo "switched to branch ${5}-${3}"
-
-# check if the loc exists or not
-if [ ! -d "./${2}" ]; then
-  mkdir -p "${2}"
-fi
-
-temp=${PWD}
-
-#discover the directories
-IFS=/ read -ra dirs <<< "${2}"
-
-for item in "${dirs[@]}"
-do
-    cd "${item}" || return #tvpeter
-
-    # check if the index file exists
-    if [ ! -f ./_index.md ]; then
-      echo -e "---\ntitle: ${item}\n---\n\n{{< childpages >}}" >> _index.md
-    fi
-
-done
-
-# goto the original directory
-cd "${temp}" || return
-
-# move the transcript to the directory
-mv "${1}" "./${2}"
diff --git a/test/test_audio.py b/test/test_audio.py
index b941d1e..8f4d689 100644
--- a/test/test_audio.py
+++ b/test/test_audio.py
@@ -26,9 +26,9 @@ def test_audio_with_title():
     )
     transcription.add_transcription_source(source_file=source, title=title)
     transcripts = transcription.start()
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,
@@ -57,9 +57,9 @@ def test_audio_with_all_data():
         source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers)
     transcripts = transcription.start()
 
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,
diff --git a/test/test_cli.py b/test/test_cli.py
index 41f85c1..1ab1484 100644
--- a/test/test_cli.py
+++ b/test/test_cli.py
@@ -13,15 +13,6 @@ def rel_path(path):
     )
 
 
-@pytest.mark.main
-def test_initialize_repo():
-    try:
-        shutil.rmtree("bitcointranscripts", ignore_errors=True)
-    except OSError as e:
-        print(f"Error occurred while removing directory: {e}")
-        assert False
-
-
 # @pytest.mark.main
 # def test_find_source_type():
 # @TODO rewwrite
diff --git a/test/test_video.py b/test/test_video.py
index 2751c10..750dc90 100644
--- a/test/test_video.py
+++ b/test/test_video.py
@@ -33,9 +33,9 @@ def test_video_with_title():
         source, title, date, tags, category, speakers)
     transcripts = transcription.start()
 
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,
@@ -65,10 +65,10 @@ def test_video_with_all_options():
     transcription.add_transcription_source(
         source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers)
     transcripts = transcription.start()
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
 
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,
@@ -110,9 +110,9 @@ def test_video_with_chapters():
                 chapter_names.append(x.split("= ")[1].strip())
         file.close()
 
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,
diff --git a/transcriber.py b/transcriber.py
index 8f51a87..3f61b23 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -88,12 +88,11 @@ def print_help(ctx, param, value):
     default=False,
     help="Summarize the transcript [only available with deepgram]",
 )
-open_pr = click.option(
-    "-p",
-    "--PR",
+github = click.option(
+    "--github",
     is_flag=True,
     default=False,
-    help="Open a PR on the bitcointranscripts repo",
+    help="Push transcripts to a new branch on the origin bitcointranscripts repo",
 )
 upload_to_s3 = click.option(
     "-u",
@@ -195,7 +194,7 @@ def print_help(ctx, param, value):
 @add_category
 @add_loc
 # Options for configuring the transcription postprocess
-@open_pr
+@github
 @upload_to_s3
 @save_to_markdown
 @noqueue
@@ -213,7 +212,7 @@ def transcribe(
     tags: list,
     speakers: list,
     category: list,
-    pr: bool,
+    github: bool,
     deepgram: bool,
     summarize: bool,
     diarize: bool,
@@ -245,7 +244,7 @@ def transcribe(
     try:
         transcription = Transcription(
             model=model,
-            pr=pr,
+            github=github,
             summarize=summarize,
             deepgram=deepgram,
             diarize=diarize,
@@ -353,7 +352,7 @@ def preprocess(
 )
 @click.argument("metadata_json_file", nargs=1)
 # Options for configuring the transcription postprocess
-@open_pr
+@github
 @upload_to_s3
 @save_to_markdown
 @noqueue
@@ -361,7 +360,7 @@ def preprocess(
 def postprocess(
     metadata_json_file,
     service,
-    pr: bool,
+    github: bool,
     upload: bool,
     markdown: bool,
     noqueue: bool,
@@ -376,7 +375,7 @@ def postprocess(
         utils.check_if_valid_file_path(metadata_json_file)
         transcription = Transcription(
             deepgram=service == "deepgram",
-            pr=pr,
+            github=github,
             upload=upload,
             markdown=markdown,
             queue=not noqueue,
@@ -406,7 +405,10 @@ def postprocess(
             f"{service}_output"]
         transcript_to_postprocess.result = transcription.service.finalize_transcript(
             transcript_to_postprocess)
-        transcription.postprocess(transcript_to_postprocess)
+        postprocessed_transcript = transcription.postprocess(transcript_to_postprocess)
+
+        if transcription.bitcointranscripts_dir:
+            transcription.push_to_github([postprocessed_transcript])
     except Exception as e:
         logger.error(e)
         traceback.print_exc()