Skip to content

Commit

Permalink
add push-to-github functionality
Browse files Browse the repository at this point in the history
Replace the unused, complex, and `gh`-dependent functionality of directly
opening a PR against master with a simpler opt-in flag that pushes a
new branch with each resulting transcript in a separate commit.
The user can then open a PR against master manually.
  • Loading branch information
kouloumos committed Dec 14, 2023
1 parent 5fcb9f9 commit 8d66953
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 41 deletions.
7 changes: 5 additions & 2 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ This transcription tool operates through a structured four-stage process:
- Process diarization. [deepgram only]
- Process chapters.
4. Postprocess: Offers multiple options for further actions:
- **Pull Request**: Opens a PR on the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo for the resulting transcript.
- **Push to GitHub**: Push transcripts to your fork of the [bitcointranscripts](https://github.com/bitcointranscripts/bitcointranscripts) repo.
- **Markdown**: Saves transcripts in a markdown format supported by bitcointranscripts.
- **Upload**: Saves transcripts in an AWS S3 Bucket.
- **Push to Queuer backend**: Sends transcripts to [a Queuer backend](https://github.com/bitcointranscripts/transcription-review-backend).
Expand All @@ -37,6 +37,9 @@ This transcription tool operates through a structured four-stage process:
- To use [deepgram](https://deepgram.com/) as a transcription service,
you must have a valid `DEEPGRAM_API_KEY` in the `.env` file.

- To push the resulting transcript to GitHub, you need to clone your fork
and set `BITCOINTRANSCRIPTS_DIR` in the `.env` file to the path of that clone.

- To push the resulting transcript to a Queuer backend, you must have a
valid `QUEUE_ENDPOINT` in the `.env` file. If not, you can instead save
the payload in a json file using the `--noqueue` flag.
Expand Down Expand Up @@ -120,7 +123,7 @@ To configure the transcription process, you can use the following flags:
- `-M` or `--diarize`: Supply this flag if the content has multiple speakers and you want to diarize it [only available with deepgram]
- `-S` or `--summarize`: Summarize the transcript [only available with deepgram]
- `-C` or `--chapters`: For YouTube videos, include the YouTube chapters and timestamps in the resulting transcript.
- `-p` or `--pr`: Open a PR on the bitcointranscripts repo
- `--github`: Push transcripts to a new branch on the origin bitcointranscripts repo
- `-u` or `--upload`: Upload processed model files to AWS S3
- `--markdown`: Save the resulting transcript to a markdown format supported by bitcointranscripts
- `--noqueue`: Do not push the resulting transcript to the Queuer, instead store the payload in a json file
Expand Down
75 changes: 57 additions & 18 deletions app/transcription.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import json
import logging
import os
import shutil
import random
import re
import subprocess
import tempfile
import time
from datetime import datetime
Expand All @@ -22,13 +25,14 @@
)
from app.logging import get_logger
from app.queuer import Queuer
from app.types import PostprocessOutput


class Transcription:
def __init__(
self,
model="tiny",
pr=False,
github=False,
summarize=False,
deepgram=False,
diarize=False,
Expand All @@ -50,8 +54,8 @@ def __init__(
self.transcript_by = "username" if test_mode else self.__get_username()
# during testing we need to create the markdown for validation purposes
self.markdown = markdown or test_mode
self.bitcointranscripts_dir = self.__configure_target_repo(github)
self.review_flag = self.__configure_review_flag(needs_review)
self.open_pr = pr
if deepgram:
self.service = services.Deepgram(
summarize, diarize, upload, model_output_dir)
Expand All @@ -73,13 +77,24 @@ def _create_subdirectory(self, subdir_name):
os.makedirs(subdir_path)
return subdir_path

def __configure_target_repo(self, github):
    """Return the directory of the target git repository, if requested.

    Args:
        github: Truthy when the transcripts should be pushed to GitHub.

    Returns:
        The value of `BITCOINTRANSCRIPTS_DIR` read from the `.env` file,
        or `None` when `github` is falsy.

    Raises:
        Exception: If `github` is set but `BITCOINTRANSCRIPTS_DIR` is
            missing or empty in the `.env` file.
    """
    if not github:
        return None
    config = dotenv_values(".env")
    git_repo_dir = config.get("BITCOINTRANSCRIPTS_DIR")
    if not git_repo_dir:
        # Fail early: without a target repository there is nothing to push to.
        raise Exception(
            "To push to GitHub you need to define a 'BITCOINTRANSCRIPTS_DIR' in your .env file")
    return git_repo_dir

def __configure_review_flag(self, needs_review):
# sanity check
if needs_review and not self.markdown:
raise Exception(
"The `--needs-review` flag is only applicable when creating a markdown")

if needs_review:
if needs_review or self.bitcointranscripts_dir:
return " --needs-review"
else:
return ""
Expand Down Expand Up @@ -282,10 +297,43 @@ def start(self, test_transcript=None):
postprocessed_transcript = self.postprocess(transcript)
self.result.append(postprocessed_transcript)

if self.bitcointranscripts_dir:
self.push_to_github(self.result)
return self.result
except Exception as e:
raise Exception(f"Error with the transcription: {e}") from e

def push_to_github(self, outputs: list[PostprocessOutput]):
    """Push the markdown file of each output to a new branch on `origin`.

    A branch named `<transcript_by>-<6 random digits>` is created from
    `origin/master` inside `self.bitcointranscripts_dir`. Each output that
    has an existing markdown file is copied under the repository at its
    transcript's `source.loc` and committed on its own. The branch is then
    pushed to `origin`, and the local clone is switched back to `master`
    with the temporary branch deleted.

    Args:
        outputs: Postprocess results. Entries without a `markdown` value,
            or whose markdown file does not exist, are skipped.

    Raises:
        subprocess.CalledProcessError: If fetching, creating the branch,
            staging a file, or pushing fails.
    """
    # Resolve once so the paths built below stay valid whatever the cwd is.
    repo_dir = os.path.abspath(self.bitcointranscripts_dir)

    def git(*args, check=True):
        # Run git inside the clone instead of changing the process-wide
        # working directory, which would break relative paths elsewhere.
        return subprocess.run(['git', *args], cwd=repo_dir, check=check)

    # Fetch the latest changes and branch off the fetched 'origin/master'
    git('fetch', 'origin', 'master')
    branch_name = f"{self.transcript_by}-{''.join(random.choices('0123456789', k=6))}"
    git('checkout', '-b', branch_name, 'origin/master')
    try:
        # For each output with markdown, create a new commit in the new branch
        for output in outputs:
            markdown_file = output.get('markdown')
            if not markdown_file:
                continue
            if not os.path.exists(markdown_file):
                self.logger.warning(
                    f"Markdown file {markdown_file} does not exist.")
                continue

            transcript = output["transcript"]
            destination_path = os.path.join(repo_dir, transcript.source.loc)
            # The target directory may not exist yet in the repository;
            # without it `shutil.copy` would not place the file inside it.
            os.makedirs(destination_path, exist_ok=True)
            shutil.copy(markdown_file, destination_path)
            git('add', os.path.join(
                destination_path, os.path.basename(markdown_file)))
            # A commit can legitimately fail (e.g. nothing changed), so
            # report it and keep going with the remaining transcripts.
            commit = git(
                'commit', '-m',
                f'Add "{transcript.title}" to {transcript.source.loc}',
                check=False)
            if commit.returncode != 0:
                self.logger.warning(
                    f"Could not commit {markdown_file} (git exited with {commit.returncode}).")

        # Push the branch to the remote repository
        git('push', 'origin', branch_name)
    finally:
        # Always leave the clone on master without the temporary branch,
        # even when one of the steps above failed.
        git('checkout', 'master', check=False)
        git('branch', '-D', branch_name, check=False)

def write_to_markdown_file(self, transcript: Transcript, output_dir):
"""Writes transcript to a markdown file and returns its absolute path
This file is the one submitted as part of the Pull Request to the
Expand Down Expand Up @@ -332,32 +380,23 @@ def write_to_json_file(self, transcript: Transcript):
self.logger.info(f"Transcription stored at {json_file}")
return json_file

def postprocess(self, transcript: Transcript):
def postprocess(self, transcript: Transcript) -> PostprocessOutput:
try:
result = transcript.result
result = {}
result["transcript"] = transcript
output_dir = f"{self.model_output_dir}/{transcript.source.loc}"
if self.markdown:
transcription_md_file = self.write_to_markdown_file(
if self.markdown or self.bitcointranscripts_dir:
result["markdown"] = self.write_to_markdown_file(
transcript,
output_dir if not self.test_mode else transcript.tmp_dir)
result = transcription_md_file
if self.open_pr:
application.create_pr(
absolute_path=transcription_md_file,
loc=transcript.source.source_file,
username=self.transcript_by,
curr_time=str(round(time.time() * 1000)),
title=transcript.title,
)
elif not self.test_mode:
transcript_json = transcript.to_json()
transcript_json["transcript_by"] = f"{self.transcript_by} via tstbtc v{__version__}"
if self.queuer:
return self.queuer.push_to_queue(transcript_json)
else:
# store payload for the user to manually send it to the queuer
payload_json_file = self.write_to_json_file(transcript)
result = payload_json_file
result["json"] = self.write_to_json_file(transcript)
return result
except Exception as e:
raise Exception(f"Error with postprocessing: {e}") from e
Expand Down
12 changes: 12 additions & 0 deletions app/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from typing import (
TypedDict,
Optional
)

from app.transcript import Transcript


class _PostprocessOutputRequired(TypedDict):
    """Keys present in every postprocess result."""

    # The transcript that was postprocessed.
    transcript: Transcript


class PostprocessOutput(_PostprocessOutputRequired, total=False):
    """Result of postprocessing a single transcript.

    `transcript` is always present. `markdown` and `json` are optional
    keys: they exist only when the corresponding file was written, so
    read them with `.get()` unless the key is known to be set.
    """

    # Path of the markdown file written for the transcript, when one was created.
    markdown: Optional[str]
    # Path of the json payload file written for the transcript, when one was created.
    json: Optional[str]
8 changes: 4 additions & 4 deletions test/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ def test_audio_with_title():
)
transcription.add_transcription_source(source_file=source, title=title)
transcripts = transcription.start()
assert os.path.isfile(transcripts[0])
assert os.path.isfile(transcripts[0]["markdown"])
check_md_file(
path=transcripts[0],
path=transcripts[0]["markdown"],
transcript_by=username,
media=source,
title=title,
Expand Down Expand Up @@ -57,9 +57,9 @@ def test_audio_with_all_data():
source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers)
transcripts = transcription.start()

assert os.path.isfile(transcripts[0])
assert os.path.isfile(transcripts[0]["markdown"])
check_md_file(
path=transcripts[0],
path=transcripts[0]["markdown"],
transcript_by=username,
media=source,
title=title,
Expand Down
12 changes: 6 additions & 6 deletions test/test_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ def test_video_with_title():
source, title, date, tags, category, speakers)
transcripts = transcription.start()

assert os.path.isfile(transcripts[0])
assert os.path.isfile(transcripts[0]["markdown"])
check_md_file(
path=transcripts[0],
path=transcripts[0]["markdown"],
transcript_by=username,
media=source,
title=title,
Expand Down Expand Up @@ -65,10 +65,10 @@ def test_video_with_all_options():
transcription.add_transcription_source(
source_file=source, title=title, date=date, tags=tags, category=category, speakers=speakers)
transcripts = transcription.start()
assert os.path.isfile(transcripts[0])
assert os.path.isfile(transcripts[0]["markdown"])

check_md_file(
path=transcripts[0],
path=transcripts[0]["markdown"],
transcript_by=username,
media=source,
title=title,
Expand Down Expand Up @@ -110,9 +110,9 @@ def test_video_with_chapters():
chapter_names.append(x.split("= ")[1].strip())
file.close()

assert os.path.isfile(transcripts[0])
assert os.path.isfile(transcripts[0]["markdown"])
check_md_file(
path=transcripts[0],
path=transcripts[0]["markdown"],
transcript_by=username,
media=source,
title=title,
Expand Down
24 changes: 13 additions & 11 deletions transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,11 @@ def print_help(ctx, param, value):
default=False,
help="Summarize the transcript [only available with deepgram]",
)
open_pr = click.option(
"-p",
"--PR",
github = click.option(
"--github",
is_flag=True,
default=False,
help="Open a PR on the bitcointranscripts repo",
help="Push transcripts to a new branch on the origin bitcointranscripts repo",
)
upload_to_s3 = click.option(
"-u",
Expand Down Expand Up @@ -195,7 +194,7 @@ def print_help(ctx, param, value):
@add_category
@add_loc
# Options for configuring the transcription postprocess
@open_pr
@github
@upload_to_s3
@save_to_markdown
@noqueue
Expand All @@ -213,7 +212,7 @@ def transcribe(
tags: list,
speakers: list,
category: list,
pr: bool,
github: bool,
deepgram: bool,
summarize: bool,
diarize: bool,
Expand Down Expand Up @@ -245,7 +244,7 @@ def transcribe(
try:
transcription = Transcription(
model=model,
pr=pr,
github=github,
summarize=summarize,
deepgram=deepgram,
diarize=diarize,
Expand Down Expand Up @@ -353,15 +352,15 @@ def preprocess(
)
@click.argument("metadata_json_file", nargs=1)
# Options for configuring the transcription postprocess
@open_pr
@github
@upload_to_s3
@save_to_markdown
@noqueue
@needs_review
def postprocess(
metadata_json_file,
service,
pr: bool,
github: bool,
upload: bool,
markdown: bool,
noqueue: bool,
Expand All @@ -376,7 +375,7 @@ def postprocess(
utils.check_if_valid_file_path(metadata_json_file)
transcription = Transcription(
deepgram=service == "deepgram",
pr=pr,
github=github,
upload=upload,
markdown=markdown,
queue=not noqueue,
Expand Down Expand Up @@ -406,7 +405,10 @@ def postprocess(
f"{service}_output"]
transcript_to_postprocess.result = transcription.service.finalize_transcript(
transcript_to_postprocess)
transcription.postprocess(transcript_to_postprocess)
postprocessed_transcript = transcription.postprocess(transcript_to_postprocess)

if transcription.bitcointranscripts_dir:
transcription.push_to_github([postprocessed_transcript])
except Exception as e:
logger.error(e)
traceback.print_exc()
Expand Down

0 comments on commit 8d66953

Please sign in to comment.