add support for configuration file

Support a `config.ini` file to set default values for various options and flags, offering greater control over the application's behavior.
bitcointranscripts · Jun 27, 2024 · 386bcb8 · 386bcb8
1 parent f905534
commit 386bcb8
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 23 deletions.
diff --git a/Readme.md → README.md b/Readme.md → README.md
@@ -60,6 +60,23 @@ This transcription tool operates through a structured four-stage process:
     - for other users, follow the instruction on
       their [site](https://ffmpeg.org/) to install
 
+- To use a specific [configuration profile](#configuration), set the `PROFILE` variable in your `.env` file to the name of the matching section in `config.ini`.
+
+
+## Configuration
+
+This application supports configuration via a `config.ini` file.
+This file allows you to set default values for various options and flags, reducing the need to specify them on the command line every time.
+Additionally, the configuration file can include options not available through the command line, offering greater flexibility and control over the application's behavior.
+
+### Creating a Configuration File
+
+An example configuration file named `config.ini.example` is included in the repository.
+To use it, copy it to `config.ini` and modify it according to your needs:
+```sh
+cp config.ini.example config.ini
+```
+
 ## Install/Uninstall
 
 Navigate to the application directory and run the below commands:
@@ -68,9 +85,8 @@ Navigate to the application directory and run the below commands:
 
 `source venv/bin/activate` activates the virtual environment
 
-`pip3 install . --use-pep517` to install the application
+`pip3 install .` to install the application
 
-To check the version:
 `tstbtc --version` view the application version
 
 `tstbtc --help` view the application help

diff --git a/app/config.py b/app/config.py
@@ -0,0 +1,12 @@
+import configparser
+from dotenv import dotenv_values
+
+def read_config(profile='DEFAULT'):
+    """Return the section of `config.ini` that belongs to `profile`.
+
+    `ConfigParser.read` silently skips a file it cannot find, so when there
+    is no `config.ini` in the working directory the 'DEFAULT' profile is an
+    empty section and every lookup on it yields the caller's fallback.
+
+    Args:
+        profile: Name of the section to load. Options missing from the
+            section are looked up in `[DEFAULT]` by configparser.
+
+    Returns:
+        The `configparser.SectionProxy` for `profile`.
+
+    Raises:
+        KeyError: If `profile` is not 'DEFAULT' and `config.ini` has no
+            section with that name.
+    """
+    config = configparser.ConfigParser()
+    config.read('config.ini')
+
+    try:
+        return config[profile]
+    except KeyError:
+        # A bare `KeyError: '<name>'` raised while the module is imported
+        # gives no hint about the cause; name the missing profile and list
+        # the ones that do exist. `sections()` never includes 'DEFAULT'.
+        available = ', '.join(['DEFAULT', *config.sections()])
+        raise KeyError(
+            f"profile '{profile}' not found in config.ini "
+            f"(available profiles: {available})"
+        ) from None
+
+# Get the current profile from the `.env` file; default to 'DEFAULT' when `PROFILE` is missing or blank
+env = dotenv_values(".env")
+config = read_config(env.get("PROFILE") or "DEFAULT")
diff --git a/app/services/deepgram.py b/app/services/deepgram.py
@@ -11,6 +11,7 @@
     application,
     utils
 )
+from app.config import config as config_profile
 from app.data_writer import DataWriter
 from app.logging import get_logger
 from app.media_processor import MediaProcessor
@@ -33,14 +34,15 @@ def __init__(self, summarize, diarize, upload, data_writer: DataWriter):
         self.diarize = diarize
         self.upload = upload
         self.data_writer = data_writer
-        self.one_sentence_per_line = True
+        self.one_sentence_per_line = config_profile.getboolean('one_sentence_per_line', True)
         self.dev_mode = False  # Extra capabilities during development mode
         self.max_audio_length = 3600.0  # 60 minutes in seconds
         self.processor = MediaProcessor(chunk_length=1200.0)
 
     def audio_to_text(self, audio_file, chunk=None):
+        language = config_profile.get('language','en')
         logger.info(
-            f"Transcribing audio {f'(chunk {chunk}) ' if chunk else ''}to text using deepgram...")
+            f"Transcribing audio {f'(chunk {chunk}) ' if chunk else ''}to text using deepgram[{language}]...")
         try:
             config = dotenv_values(".env")
             dg_client = deepgram.Deepgram(config["DEEPGRAM_API_KEY"])
@@ -57,6 +59,7 @@ def audio_to_text(self, audio_file, chunk=None):
                         "smart_formatting": True,
                         "summarize": self.summarize,
                         "model": "whisper-large",
+                        "language": language,
                     },
                 )
                 audio.close()

diff --git a/transcriber.py b/transcriber.py
@@ -8,9 +8,10 @@
 from app import (
     __app_name__,
     __version__,
+    commands,
     utils
 )
-from app.commands import queue
+from app.config import config
 from app.logging import configure_logger, get_logger
 from app.transcription import Transcription
 from app.types import GitHubMode
@@ -24,7 +25,6 @@ def print_version(ctx, param, value):
     click.echo(f"{__app_name__} v{__version__}")
     ctx.exit()
 
-
 @click.option(
     "-v",
     "--version",
@@ -38,14 +38,12 @@ def print_version(ctx, param, value):
 def cli():
     pass
 
-
 def print_help(ctx, param, value):
     if not value or ctx.resilient_parsing:
         return
     logging.info(ctx.get_help())
     ctx.exit()
 
-
 whisper = click.option(
     "-m",
     "--model",
@@ -62,35 +60,36 @@ def print_help(ctx, param, value):
             "large-v2",
         ]
     ),
-    default="tiny.en",
+    default=config.get('model', 'tiny.en'),
     show_default=True,
     help="Select which whisper model to use for the transcription",
 )
 deepgram = click.option(
     "-D",
     "--deepgram",
     is_flag=True,
-    default=False,
+    default=config.getboolean('deepgram', False),
     help="Use deepgram for transcription",
 )
 diarize = click.option(
     "-M",
     "--diarize",
     is_flag=True,
-    default=False,
+    default=config.getboolean('diarize', False),
     help="Supply this flag if you have multiple speakers AKA "
     "want to diarize the content",
 )
 summarize = click.option(
     "-S",
     "--summarize",
     is_flag=True,
-    default=False,
+    default=config.getboolean('summarize', False),
     help="Summarize the transcript [only available with deepgram]",
 )
 cutoff_date = click.option(
     "--cutoff-date",
     type=str,
+    default=config.get('cutoff_date', None),
     help=("Specify a cutoff date (in YYYY-MM-DD format) to process only sources "
           "published after this date. Sources with a publication date on or before "
           "the cutoff will be excluded from processing. This option is useful for "
@@ -100,7 +99,7 @@ def print_help(ctx, param, value):
 github = click.option(
     "--github",
     type=click.Choice(["remote", "local", "none"]),
-    default="none",
+    default=config.get('github', 'none'),
     help=("Specify the GitHub operation mode."
           "'remote': Create a new branch, push changes to it, and push it to the origin bitcointranscripts repo. "
           "'local': Commit changes to the current local branch without pushing to the remote repo."
@@ -111,46 +110,46 @@ def print_help(ctx, param, value):
     "-u",
     "--upload",
     is_flag=True,
-    default=False,
+    default=config.getboolean('upload_to_s3', False),
     help="Upload processed model files to AWS S3",
 )
 save_to_markdown = click.option(
     "--markdown",
     is_flag=True,
-    default=False,
+    default=config.getboolean('save_to_markdown', False),
     help="Save the resulting transcript to a markdown format supported by bitcointranscripts",
 )
 noqueue = click.option(
     "--noqueue",
     is_flag=True,
-    default=False,
+    default=config.getboolean('noqueue', False),
     help="Do not push the resulting transcript to the Queuer backend",
 )
 needs_review = click.option(
     "--needs-review",
     is_flag=True,
-    default=False,
+    default=config.getboolean('needs_review', False),
     help="Add 'needs review' flag to the resulting transcript",
 )
 model_output_dir = click.option(
     "-o",
     "--model_output_dir",
     type=str,
-    default="local_models/",
+    default=config.get('model_output_dir', 'local_models/'),
     show_default=True,
     help="Set the directory for saving model outputs",
 )
 nocleanup = click.option(
     "--nocleanup",
     is_flag=True,
-    default=False,
+    default=config.getboolean('nocleanup', False),
     help="Do not remove temp files on exit",
 )
 verbose_logging = click.option(
     "-V",
     "--verbose",
     is_flag=True,
-    default=False,
+    default=config.getboolean('verbose_logging', False),
     help="Supply this flag to enable verbose logging",
 )
 
@@ -190,7 +189,6 @@ def print_help(ctx, param, value):
     help="Add a category to the transcript's metadata (can be used multiple times)",
 )
 
-
 @cli.command()
 @click.argument("source", nargs=1)
 # Available transcription models and services
@@ -462,8 +460,7 @@ def postprocess(
         logger.error(e)
         traceback.print_exc()
 
-
-cli.add_command(queue.commands)
+cli.add_command(commands.queue)
 
 if __name__ == '__main__':
     cli()