Skip to content

Commit

Permalink
add push-to-github functionality
Browse files Browse the repository at this point in the history
replace the unused, complex and `gh`-dependent functionality of directly
opening a PR against master with a simpler opt-in flag that pushes a
new branch with a separate commit for each resulting transcript.
The user can then open a PR against master manually.
  • Loading branch information
kouloumos committed Dec 14, 2023
1 parent 5fcb9f9 commit a0a4d9f
Show file tree
Hide file tree
Showing 10 changed files with 98 additions and 167 deletions.
25 changes: 6 additions & 19 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ This transcription tool operates through a structured four-stage process:
- Process diarization. [deepgram only]
- Process chapters.
4. Postprocess: Offers multiple options for further actions:
- **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript.
- **Push to GitHub**: Push transcripts to your fork of the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo.
- **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts.
- **Upload**: Saves transcripts in an AWS S3 Bucket.
- **Push to Queuer backend**: Sends transcripts to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend).
Expand All @@ -37,27 +37,14 @@ This transcription tool operates through a structured four-stage process:
- To use [deepgram](https://deepgram.com/) as a transcription service,
you must have a valid `DEEPGRAM_API_KEY` in the `.env` file.

- To push the resulting transcript to GitHub you need to fork
[bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts)
and then clone your fork and define your `BITCOINTRANSCRIPTS_DIR` in the `.env` file.

- To push the resulting transcript to a Queuer backend, you must have a
valid `QUEUE_ENDPOINT` in the `.env` file. If not, you can instead save
the payload in a json file using the `--noqueue` flag.

- To let the tool fork the bitcointranscripts repo and open a PR, you must
log in to your GitHub account. Install the GitHub CLI using the
instructions in its repo [here](https://github.com/cli/cli#installation).
When prompted, select the following options to
log in:

- what account do you want to log into? `Github.com`

- what is your preferred protocol for Git operations? `SSH`

- Upload your SSH public key to your GitHub account? `skip`

- How would you like to authenticate GitHub CLI? `Login with a web browser`

- Copy the generated one-time passcode and paste it into the browser to
authenticate, if you have enabled 2FA.

- To enable pushing the models to an S3 bucket,
- [Install](https://aws.amazon.com/cli/) aws-cli to your system.
- [Configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
Expand Down Expand Up @@ -120,7 +107,7 @@ To configure the transcription process, you can use the following flags:
- `-M` or `--diarize`: Supply this flag if you have multiple speakers AKA want to diarize the content [only available with deepgram]
- `-S` or `--summarize`: Summarize the transcript [only available with deepgram]
- `-C` or `--chapters`: For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript.
- `-p` or `--pr`: Open a PR on the bitcointranscripts repo
- `--github`: Push transcripts to a new branch on the origin bitcointranscripts repo
- `-u` or `--upload`: Upload processed model files to AWS S3
- `--markdown`: Save the resulting transcript to a markdown format supported by bitcointranscripts
- `--noqueue`: Do not push the resulting transcript to the Queuer, instead store the payload in a json file
Expand Down
21 changes: 0 additions & 21 deletions app/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,6 @@ def convert_wav_to_mp3(abs_path, filename, working_dir="tmp/"):
return os.path.abspath(os.path.join(working_dir, filename[:-4] + ".mp3"))


def create_pr(absolute_path, loc, username, curr_time, title):
    """Invoke the `initializeRepo.sh` and `github.sh` scripts for a transcript.

    The branch name passed to both scripts is `loc` with every "/" replaced
    by "-". The scripts are run through `bash`, one after the other, and
    their exit codes are not inspected.

    Args:
        absolute_path: forwarded to `initializeRepo.sh` as its first argument.
        loc: forwarded to `initializeRepo.sh`; also the basis of the branch name.
        username: forwarded to both scripts.
        curr_time: forwarded to both scripts.
        title: forwarded to `github.sh` as its last argument.
    """
    branch_name = loc.replace("/", "-")
    initialize_command = [
        "bash",
        "initializeRepo.sh",
        absolute_path,
        loc,
        branch_name,
        username,
        curr_time,
    ]
    github_command = ["bash", "github.sh", branch_name, username, curr_time, title]
    # Order matters: the repository script runs before the GitHub script.
    for command in (initialize_command, github_command):
        subprocess.call(command)
    logging.getLogger(__app_name__).info(
        "Please check the PR for the transcription.")


def clean_up(tmp_dir):
try:
shutil.rmtree(tmp_dir)
Expand All @@ -56,7 +36,6 @@ def clean_up(tmp_dir):
raise



def upload_file_to_s3(file_path):
logger = logging.getLogger(__app_name__)
s3 = boto3.client("s3")
Expand Down
75 changes: 57 additions & 18 deletions app/transcription.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import json
import logging
import os
import shutil
import random
import re
import subprocess
import tempfile
import time
from datetime import datetime
Expand All @@ -22,13 +25,14 @@
)
from app.logging import get_logger
from app.queuer import Queuer
from app.types import PostprocessOutput


class Transcription:
def __init__(
self,
model="tiny",
pr=False,
github=False,
summarize=False,
deepgram=False,
diarize=False,
Expand All @@ -50,8 +54,8 @@ def __init__(
self.transcript_by = "username" if test_mode else self.__get_username()
# during testing we need to create the markdown for validation purposes
self.markdown = markdown or test_mode
self.bitcointranscripts_dir = self.__configure_target_repo(github)
self.review_flag = self.__configure_review_flag(needs_review)
self.open_pr = pr
if deepgram:
self.service = services.Deepgram(
summarize, diarize, upload, model_output_dir)
Expand All @@ -73,13 +77,24 @@ def _create_subdirectory(self, subdir_name):
os.makedirs(subdir_path)
return subdir_path

def __configure_target_repo(self, github):
    """Resolve the local bitcointranscripts clone used for pushing to GitHub.

    Args:
        github: truthy when pushing to GitHub has been requested.

    Returns:
        The value of `BITCOINTRANSCRIPTS_DIR` read from the `.env` file,
        or None when `github` is falsy.

    Raises:
        Exception: if `github` is set but `BITCOINTRANSCRIPTS_DIR` is
            missing or empty in the `.env` file.
    """
    if not github:
        return None
    git_repo_dir = dotenv_values(".env").get("BITCOINTRANSCRIPTS_DIR")
    if not git_repo_dir:
        raise Exception(
            "To push to GitHub you need to define a 'BITCOINTRANSCRIPTS_DIR' in your .env file")
    return git_repo_dir

def __configure_review_flag(self, needs_review):
# sanity check
if needs_review and not self.markdown:
raise Exception(
"The `--needs-review` flag is only applicable when creating a markdown")

if needs_review:
if needs_review or self.bitcointranscripts_dir:
return " --needs-review"
else:
return ""
Expand Down Expand Up @@ -282,10 +297,43 @@ def start(self, test_transcript=None):
postprocessed_transcript = self.postprocess(transcript)
self.result.append(postprocessed_transcript)

if self.bitcointranscripts_dir:
self.push_to_github(self.result)
return self.result
except Exception as e:
raise Exception(f"Error with the transcription: {e}") from e

def push_to_github(self, outputs: list[PostprocessOutput]):
    """Commit each transcript's markdown to a new branch and push it to origin.

    A branch named `<transcript_by>-<6 random digits>` is created from
    `origin/master` inside the `bitcointranscripts_dir` repository. Every
    output that carries an existing markdown file is copied into its
    `source.loc` directory and committed on its own. The branch is then
    pushed to `origin` and removed locally.

    Args:
        outputs: postprocess results; entries without a "markdown" path
            are skipped.

    Raises:
        subprocess.CalledProcessError: if fetching, creating the branch,
            staging a file or pushing fails.
    """
    # Absolute path so it means the same thing to this process and to the
    # git subprocesses, which run with the repository as their cwd.
    repo_dir = os.path.abspath(self.bitcointranscripts_dir)

    def git(*args, check=True):
        # `cwd=` scopes the command to the target repo without changing the
        # working directory of the whole process (as os.chdir would).
        return subprocess.run(["git", *args], cwd=repo_dir, check=check)

    # Fetch the latest changes and branch off the fetched 'origin/master'
    git("fetch", "origin", "master")
    branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}"
    git("checkout", "-b", branch_name, "origin/master")
    try:
        # One commit per output that has a markdown file
        for output in outputs:
            markdown_file = output.get("markdown")
            if not markdown_file:
                continue
            if not os.path.exists(markdown_file):
                self.logger.warning(
                    f"Markdown file {markdown_file} does not exist.")
                continue
            transcript = output["transcript"]
            destination_dir = os.path.join(repo_dir, transcript.source.loc)
            # Without this, copying into a missing directory would create a
            # *file* named after the directory instead.
            os.makedirs(destination_dir, exist_ok=True)
            shutil.copy(markdown_file, destination_dir)
            git("add", os.path.join(
                destination_dir, os.path.basename(markdown_file)))
            # `git commit` exits non-zero when there is nothing to commit;
            # that should not abort the remaining transcripts.
            commit = git(
                "commit", "-m",
                f'Add "{transcript.title}" to {transcript.source.loc}',
                check=False)
            if commit.returncode != 0:
                self.logger.warning(
                    f"Nothing was committed for {markdown_file}")

        # Push the branch to the remote repository
        git("push", "origin", branch_name)
    finally:
        # Best-effort cleanup of the local branch, even after a failure
        git("checkout", "master", check=False)
        git("branch", "-D", branch_name, check=False)

def write_to_markdown_file(self, transcript: Transcript, output_dir):
"""Writes transcript to a markdown file and returns its absolute path
This file is the one submitted as part of the Pull Request to the
Expand Down Expand Up @@ -332,32 +380,23 @@ def write_to_json_file(self, transcript: Transcript):
self.logger.info(f"Transcription stored at {json_file}")
return json_file

def postprocess(self, transcript: Transcript):
def postprocess(self, transcript: Transcript) -> PostprocessOutput:
try:
result = transcript.result
result = {}
result["transcript"] = transcript
output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
if self.markdown:
transcription_md_file = self.write_to_markdown_file(
if self.markdown or self.bitcointranscripts_dir:
result["markdown"] = self.write_to_markdown_file(
transcript,
output_dir if not self.test_mode else transcript.tmp_dir)
result = transcription_md_file
if self.open_pr:
application.create_pr(
absolute_path=transcription_md_file,
loc=transcript.source.source_file,
username=self.transcript_by,
curr_time=str(round(time.time() * 1000)),
title=transcript.title,
)
elif not self.test_mode:
transcript_json = transcript.to_json()
transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}"
if self.queuer:
return self.queuer.push_to_queue(transcript_json)
else:
# store payload for the user to manually send it to the queuer
payload_json_file = self.write_to_json_file(transcript)
result = payload_json_file
result["json"] = self.write_to_json_file(transcript)
return result
except Exception as e:
raise Exception(f"Error with postprocessing: {e}") from e
Expand Down
12 changes: 12 additions & 0 deletions app/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import (
TypedDict,
Optional
)

from app.transcript import Transcript


class _PostprocessOutputRequired(TypedDict):
    """Keys that every postprocess result carries."""
    # the transcript the result was produced from
    transcript: Transcript


class PostprocessOutput(_PostprocessOutputRequired, total=False):
    """Result of postprocessing a single transcript.

    `transcript` is always present. `markdown` and `json` are only set when
    the corresponding file was written, so they are declared as
    non-required keys (`total=False`) and should be read with `.get()`.
    """
    # path of the generated markdown file, when one was written
    markdown: Optional[str]
    # path of the stored json payload, when one was written
    json: Optional[str]
15 changes: 0 additions & 15 deletions github.sh

This file was deleted.

64 changes: 0 additions & 64 deletions initializeRepo.sh

This file was deleted.

8 changes: 4 additions & 4 deletions test/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ def test_audio_with_title():
)
transcription.add_transcription_source(source_file=source, title=title)
transcripts = transcription.start()
assert os.path.isfile(transcripts[0])
assert os.path.isfile(transcripts[0]["markdown"])
check_md_file(
path=transcripts[0],
path=transcripts[0]["markdown"],
transcript_by=username,
media=source,
title=title,
Expand Down Expand Up @@ -57,9 +57,9 @@ def test_audio_with_all_data():
source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers)
transcripts = transcription.start()

assert os.path.isfile(transcripts[0])
assert os.path.isfile(transcripts[0]["markdown"])
check_md_file(
path=transcripts[0],
path=transcripts[0]["markdown"],
transcript_by=username,
media=source,
title=title,
Expand Down
9 changes: 0 additions & 9 deletions test/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,6 @@ def rel_path(path):
)


@pytest.mark.main
def test_initialize_repo():
    """Remove any `bitcointranscripts` directory; fail if removal raises OSError."""
    repo_dir = "bitcointranscripts"
    try:
        shutil.rmtree(repo_dir, ignore_errors=True)
    except OSError as error:
        print(f"Error occurred while removing directory: {error}")
        assert False


# @pytest.mark.main
# def test_find_source_type():
# @TODO rewrite
Expand Down
Loading

0 comments on commit a0a4d9f

Please sign in to comment.