add push-to-github functionality

Replace the unused, complex, and `gh`-dependent functionality of directly opening a PR against master with a simpler opt-in flag that pushes a new branch, with each resulting transcript in its own commit. The user can then open a PR against master manually.
bitcointranscripts · Dec 14, 2023 · 8d66953 · 8d66953
1 parent 5fcb9f9
commit 8d66953
Show file tree

Hide file tree

Showing 6 changed files with 97 additions and 41 deletions.
diff --git a/Readme.md b/Readme.md
@@ -25,7 +25,7 @@ This transcription tool operates through a structured four-stage process:
         - Process diarization. [deepgram only]
         - Process chapters.
 4. Postprocess: Offers multiple options for further actions:
-    - **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript.
+    - **Push to GitHub**: Push transcripts to your fork of the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo.
     - **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts.
     - **Upload**: Saves transcripts in an AWS S3 Bucket.
     - **Push to Queuer backend**: Sends transcripts to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend).
@@ -37,6 +37,9 @@ This transcription tool operates through a structured four-stage process:
 - To use [deepgram](https://deepgram.com/) as a transcription service,
   you must have a valid `DEEPGRAM_API_KEY` in the `.env` file.
 
+- To push the resulting transcript to GitHub, you must clone your fork and set
+  `BITCOINTRANSCRIPTS_DIR` in the `.env` file to the path of that local clone.
+
 - To push the resulting transcript to a Queuer backend, you must have a 
   valid `QUEUE_ENDPOINT` in the `.env` file. If not, you can instead save
   the payload in a json file using the `--noqueue` flag.
@@ -120,7 +123,7 @@ To configure the transcription process, you can use the following flags:
 - `-M` or `--diarize`: Supply this flag if you have multiple speakers AKA want to diarize the content [only available with deepgram]
 - `-S` or `--summarize`: Summarize the transcript [only available with deepgram]
 - `-C` or `--chapters`: For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript.
-- `-p` or `--pr`: Open a PR on the bitcointranscripts repo
+- `--github`: Push transcripts to a new branch on the origin bitcointranscripts repo
 - `-u` or `--upload`: Upload processed model files to AWS S3
 - `--markdown`: Save the resulting transcript to a markdown format supported by bitcointranscripts
 - `--noqueue`: Do not push the resulting transcript to the Queuer, instead store the payload in a json file

diff --git a/app/transcription.py b/app/transcription.py
@@ -1,7 +1,10 @@
 import json
 import logging
 import os
+import shutil
+import random
 import re
+import subprocess
 import tempfile
 import time
 from datetime import datetime
@@ -22,13 +25,14 @@
 )
 from app.logging import get_logger
 from app.queuer import Queuer
+from app.types import PostprocessOutput
 
 
 class Transcription:
     def __init__(
         self,
         model="tiny",
-        pr=False,
+        github=False,
         summarize=False,
         deepgram=False,
         diarize=False,
@@ -50,8 +54,8 @@ def __init__(
         self.transcript_by = "username" if test_mode else self.__get_username()
         # during testing we need to create the markdown for validation purposes
         self.markdown = markdown or test_mode
+        self.bitcointranscripts_dir = self.__configure_target_repo(github)
         self.review_flag = self.__configure_review_flag(needs_review)
-        self.open_pr = pr
         if deepgram:
             self.service = services.Deepgram(
                 summarize, diarize, upload, model_output_dir)
@@ -73,13 +77,24 @@ def _create_subdirectory(self, subdir_name):
         os.makedirs(subdir_path)
         return subdir_path
 
+    def __configure_target_repo(self, github):
+        """Resolve the local bitcointranscripts repository used for pushing.
+
+        Args:
+            github: whether the push-to-GitHub option was requested.
+
+        Returns:
+            The value of `BITCOINTRANSCRIPTS_DIR` read from the `.env` file,
+            or `None` when `github` is falsy.
+
+        Raises:
+            Exception: if `github` is set but `BITCOINTRANSCRIPTS_DIR` is
+                missing or empty in the `.env` file.
+        """
+        if not github:
+            return None
+        git_repo_dir = dotenv_values(".env").get("BITCOINTRANSCRIPTS_DIR")
+        if not git_repo_dir:
+            raise Exception(
+                "To push to GitHub you need to define a 'BITCOINTRANSCRIPTS_DIR' in your .env file")
+        return git_repo_dir
+
     def __configure_review_flag(self, needs_review):
         # sanity check
         if needs_review and not self.markdown:
             raise Exception(
                 "The `--needs-review` flag is only applicable when creating a markdown")
 
-        if needs_review:
+        if needs_review or self.bitcointranscripts_dir:
             return " --needs-review"
         else:
             return ""
@@ -282,10 +297,43 @@ def start(self, test_transcript=None):
                 postprocessed_transcript = self.postprocess(transcript)
                 self.result.append(postprocessed_transcript)
 
+            if self.bitcointranscripts_dir:
+                self.push_to_github(self.result)
             return self.result
         except Exception as e:
             raise Exception(f"Error with the transcription: {e}") from e
 
+    def push_to_github(self, outputs: list[PostprocessOutput]):
+        """Push the markdown of each output to a new branch of the target repo.
+
+        A branch named `<transcript_by>-<6 random digits>` is created from
+        `origin/master` inside `self.bitcointranscripts_dir`. Every output
+        that has a markdown file gets its own commit; the branch is then
+        pushed to `origin` and deleted locally.
+
+        Args:
+            outputs: postprocess results; entries without a `markdown` value
+                are skipped.
+
+        Raises:
+            subprocess.CalledProcessError: if any git command fails.
+        """
+        # Resolve once so that a relative BITCOINTRANSCRIPTS_DIR is always
+        # interpreted against the directory the tool was started from.
+        repo_dir = os.path.abspath(self.bitcointranscripts_dir)
+
+        def git(*args):
+            # Run git inside the target repo via `cwd` instead of `os.chdir`,
+            # leaving the working directory of the process untouched, and
+            # fail loudly instead of continuing on a broken repository state.
+            subprocess.run(['git', *args], cwd=repo_dir, check=True)
+
+        # Fetch the latest changes from the remote repository
+        git('fetch', 'origin', 'master')
+        # Create a new branch from the fetched 'origin/master'
+        branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}"
+        git('checkout', '-b', branch_name, 'origin/master')
+        # For each output with markdown, create a new commit in the new branch
+        for output in outputs:
+            markdown_file = output.get('markdown')
+            if not markdown_file:
+                continue
+            # Ensure the markdown file exists before copying
+            if not os.path.exists(markdown_file):
+                self.logger.warning(
+                    f"Markdown file {markdown_file} does not exist.")
+                continue
+            destination_path = os.path.join(
+                repo_dir, output["transcript"].source.loc)
+            # The target directory may not exist yet for a new `loc`
+            os.makedirs(destination_path, exist_ok=True)
+            shutil.copy(markdown_file, destination_path)
+            git('add', os.path.join(
+                destination_path, os.path.basename(markdown_file)))
+            git('commit', '-m',
+                f'Add "{output["transcript"].title}" to {output["transcript"].source.loc}')
+
+        # Push the branch to the remote repository
+        git('push', 'origin', branch_name)
+        # Delete branch locally
+        git('checkout', 'master')
+        git('branch', '-D', branch_name)
+
     def write_to_markdown_file(self, transcript: Transcript, output_dir):
         """Writes transcript to a markdown file and returns its absolute path
         This file is the one submitted as part of the Pull Request to the
@@ -332,32 +380,23 @@ def write_to_json_file(self, transcript: Transcript):
         self.logger.info(f"Transcription stored at {json_file}")
         return json_file
 
-    def postprocess(self, transcript: Transcript):
+    def postprocess(self, transcript: Transcript) -> PostprocessOutput:
         try:
-            result = transcript.result
+            result = {}
+            result["transcript"] = transcript
             output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
-            if self.markdown:
-                transcription_md_file = self.write_to_markdown_file(
+            if self.markdown or self.bitcointranscripts_dir:
+                result["markdown"] = self.write_to_markdown_file(
                     transcript,
                     output_dir if not self.test_mode else transcript.tmp_dir)
-                result = transcription_md_file
-            if self.open_pr:
-                application.create_pr(
-                    absolute_path=transcription_md_file,
-                    loc=transcript.source.source_file,
-                    username=self.transcript_by,
-                    curr_time=str(round(time.time() * 1000)),
-                    title=transcript.title,
-                )
             elif not self.test_mode:
                 transcript_json = transcript.to_json()
                 transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}"
                 if self.queuer:
                     return self.queuer.push_to_queue(transcript_json)
                 else:
                     # store payload for the user to manually send it to the queuer
-                    payload_json_file = self.write_to_json_file(transcript)
-                    result = payload_json_file
+                    result["json"] = self.write_to_json_file(transcript)
             return result
         except Exception as e:
             raise Exception(f"Error with postprocessing: {e}") from e

diff --git a/app/types.py b/app/types.py
@@ -0,0 +1,12 @@
+from typing import (
+    TypedDict,
+    Optional
+)
+
+from app.transcript import Transcript
+
+
+class _PostprocessOutputRequired(TypedDict):
+    # Keys that every postprocess result carries.
+    transcript: Transcript
+
+
+class PostprocessOutput(_PostprocessOutputRequired, total=False):
+    """Result of postprocessing a single transcript.
+
+    `transcript` is always present. `markdown` and `json` are optional keys:
+    `Optional[...]` alone would only allow a `None` value while still
+    requiring the key, so they are declared with `total=False`.
+    """
+    # Path of the generated markdown file, when one was written
+    markdown: Optional[str]
+    # Path of the stored json payload, when one was written
+    json: Optional[str]
diff --git a/test/test_audio.py b/test/test_audio.py
@@ -26,9 +26,9 @@ def test_audio_with_title():
     )
     transcription.add_transcription_source(source_file=source, title=title)
     transcripts = transcription.start()
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,
@@ -57,9 +57,9 @@ def test_audio_with_all_data():
         source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers)
     transcripts = transcription.start()
 
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,

diff --git a/test/test_video.py b/test/test_video.py
@@ -33,9 +33,9 @@ def test_video_with_title():
         source, title, date, tags, category, speakers)
     transcripts = transcription.start()
 
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,
@@ -65,10 +65,10 @@ def test_video_with_all_options():
     transcription.add_transcription_source(
         source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers)
     transcripts = transcription.start()
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
 
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,
@@ -110,9 +110,9 @@ def test_video_with_chapters():
                 chapter_names.append(x.split("= ")[1].strip())
         file.close()
 
-    assert os.path.isfile(transcripts[0])
+    assert os.path.isfile(transcripts[0]["markdown"])
     check_md_file(
-        path=transcripts[0],
+        path=transcripts[0]["markdown"],
         transcript_by=username,
         media=source,
         title=title,

diff --git a/transcriber.py b/transcriber.py
@@ -88,12 +88,11 @@ def print_help(ctx, param, value):
     default=False,
     help="Summarize the transcript [only available with deepgram]",
 )
-open_pr = click.option(
-    "-p",
-    "--PR",
+github = click.option(
+    "--github",
     is_flag=True,
     default=False,
-    help="Open a PR on the bitcointranscripts repo",
+    help="Push transcripts to a new branch on the origin bitcointranscripts repo",
 )
 upload_to_s3 = click.option(
     "-u",
@@ -195,7 +194,7 @@ def print_help(ctx, param, value):
 @add_category
 @add_loc
 # Options for configuring the transcription postprocess
-@open_pr
+@github
 @upload_to_s3
 @save_to_markdown
 @noqueue
@@ -213,7 +212,7 @@ def transcribe(
     tags: list,
     speakers: list,
     category: list,
-    pr: bool,
+    github: bool,
     deepgram: bool,
     summarize: bool,
     diarize: bool,
@@ -245,7 +244,7 @@ def transcribe(
     try:
         transcription = Transcription(
             model=model,
-            pr=pr,
+            github=github,
             summarize=summarize,
             deepgram=deepgram,
             diarize=diarize,
@@ -353,15 +352,15 @@ def preprocess(
 )
 @click.argument("metadata_json_file", nargs=1)
 # Options for configuring the transcription postprocess
-@open_pr
+@github
 @upload_to_s3
 @save_to_markdown
 @noqueue
 @needs_review
 def postprocess(
     metadata_json_file,
     service,
-    pr: bool,
+    github: bool,
     upload: bool,
     markdown: bool,
     noqueue: bool,
@@ -376,7 +375,7 @@ def postprocess(
         utils.check_if_valid_file_path(metadata_json_file)
         transcription = Transcription(
             deepgram=service == "deepgram",
-            pr=pr,
+            github=github,
             upload=upload,
             markdown=markdown,
             queue=not noqueue,
@@ -406,7 +405,10 @@ def postprocess(
             f"{service}_output"]
         transcript_to_postprocess.result = transcription.service.finalize_transcript(
             transcript_to_postprocess)
-        transcription.postprocess(transcript_to_postprocess)
+        postprocessed_transcript = transcription.postprocess(transcript_to_postprocess)
+
+        if transcription.bitcointranscripts_dir:
+            transcription.push_to_github([postprocessed_transcript])
     except Exception as e:
         logger.error(e)
         traceback.print_exc()