Skip to content

Commit

Permalink
Added uploading to s3 to bedstat
Browse files Browse the repository at this point in the history
  • Loading branch information
khoroshevskyi committed Dec 7, 2023
1 parent 9e2c7e6 commit 92eac34
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 29 deletions.
25 changes: 4 additions & 21 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def run_all(
no_db_commit: bool = False,
force_overwrite: bool = False,
skip_qdrant: bool = True,
upload_s3: bool = False,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> str:
Expand Down Expand Up @@ -123,6 +124,7 @@ def run_all(
:param force_overwrite: force overwrite analysis
:param no_db_commit: whether the JSON commit to the database should be skipped (default: False)
:param skip_qdrant: whether to skip qdrant indexing
:param upload_s3: whether to upload to s3
:param pm: pypiper object
:return: bed digest
"""
Expand Down Expand Up @@ -195,6 +197,7 @@ def run_all(
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
pm=pm,
)
return bed_digest
Expand Down Expand Up @@ -284,12 +287,11 @@ def insert_pep(
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
pm=pm,
)
pep.samples[i].record_identifier = bed_id

if upload_s3:
load_to_s3(output_folder, pm)
else:
_LOGGER.info("Skipping uploading to s3. Flag `upload_s3` is set to False")

Expand All @@ -306,25 +308,6 @@ def insert_pep(
)


def load_to_s3(output_folder: str, pm: pypiper.PipelineManager) -> None:
    """
    Sync local bed, bigbed and statistics folders to the s3://bedbase bucket.

    Runs one ``aws s3 sync`` per folder through the pipeline manager, so each
    upload is logged and lock-protected like any other pipeline command.

    :param output_folder: base output folder that contains the bed, bigbed
        and statistics subfolders
    :param pm: pypiper.PipelineManager used to execute the aws cli commands
    :return: None
    """
    # One entry per upload target:
    # (local subfolder / s3 prefix, extra aws-cli flags, pm lock name, log message)
    sync_targets = [
        (BED_FOLDER_NAME, " --exclude 'bed_qc/*'", "s3_sync_big", "Uploading to s3 bed files"),
        (BIGBED_FOLDER_NAME, "", "s3_sync_bigbed", "Uploading to s3 bigbed files"),
        (OUTPUT_FOLDER_NAME, "", "s3_sync_bedstat", "Uploading to s3 bed statistics files"),
    ]
    for folder_name, extra_flags, lock_name, log_msg in sync_targets:
        # --size-only: skip files whose size already matches the s3 copy
        command = (
            f"aws s3 sync {os.path.join(output_folder, folder_name)} "
            f"s3://bedbase/{folder_name} --size-only{extra_flags}"
        )
        _LOGGER.info(log_msg)
        pm.run(cmd=command, lock_name=lock_name)


def main(test_args: dict = None) -> NoReturn:
"""
Run pipeline that was specified in as positional argument.
Expand Down
58 changes: 50 additions & 8 deletions bedboss/bedstat/bedstat.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Union
from typing import Union, NoReturn
import json
import os
import requests
Expand All @@ -7,7 +7,12 @@
import logging
from geniml.io import RegionSet

from bedboss.const import OUTPUT_FOLDER_NAME
from bedboss.const import (
OUTPUT_FOLDER_NAME,
BED_FOLDER_NAME,
BIGBED_FOLDER_NAME,
BEDSTAT_OUTPUT,
)


_LOGGER = logging.getLogger("bedboss")
Expand All @@ -33,6 +38,35 @@ def convert_unit(size_in_bytes: int) -> str:
return str(round(size_in_bytes / (1024 * 1024 * 1024))) + "GB"


def load_to_s3(
    output_folder: str,
    pm: pypiper.PipelineManager,
    bed_file: str,
    digest: str,
    bigbed_file: Union[str, None] = None,
) -> None:
    """
    Upload one bed file (and optionally its bigbed) plus its statistics to s3.

    Each upload runs through the pipeline manager so it is logged and
    lock-protected like any other pipeline command.

    :param output_folder: base output folder; file paths below are joined to it
    :param pm: pypiper.PipelineManager used to execute the aws cli commands
    :param bed_file: bed file path relative to output_folder
    :param digest: digest of the bed file; names the statistics subfolder
        that is synced to s3
    :param bigbed_file: bigbed file path relative to output_folder;
        upload is skipped when not provided
    :return: None
    """
    command = f"aws s3 cp {os.path.join(output_folder, bed_file)} s3://bedbase/{BED_FOLDER_NAME}"
    _LOGGER.info("Uploading to s3 bed files")
    pm.run(cmd=command, lock_name="s3_sync_bed")
    if bigbed_file:
        command = f"aws s3 cp {os.path.join(output_folder, bigbed_file)} s3://bedbase/{BIGBED_FOLDER_NAME}"
        _LOGGER.info("Uploading to s3 bigbed files")
        pm.run(cmd=command, lock_name="s3_sync_bigbed")
    # Statistics are a whole per-digest folder, so sync (not cp) is used;
    # --size-only skips files whose size already matches the s3 copy.
    command = f"aws s3 sync {os.path.join(output_folder, OUTPUT_FOLDER_NAME, BEDSTAT_OUTPUT, digest)} s3://bedbase/{OUTPUT_FOLDER_NAME}/{BEDSTAT_OUTPUT}/{digest} --size-only"
    _LOGGER.info("Uploading to s3 bed statistics files")
    pm.run(cmd=command, lock_name="s3_sync_bedstat")


def bedstat(
bedfile: str,
bedbase_config: Union[str, bbconf.BedBaseConf],
Expand All @@ -49,6 +83,7 @@ def bedstat(
no_db_commit: bool = False,
force_overwrite: bool = False,
skip_qdrant: bool = True,
upload_s3: bool = False,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> str:
Expand Down Expand Up @@ -76,13 +111,14 @@ def bedstat(
skipped
:param skip_qdrant: whether to skip qdrant indexing [Default: True]
:param bool force_overwrite: whether to overwrite the existing record
:param upload_s3: whether to upload the bed file to s3
:param pm: pypiper object
:return: bed_digest: the digest of the bed file
"""
# TODO why are we no longer using bbconf to get the output path?
# outfolder_stats = bbc.get_bedstat_output_path()
outfolder_stats = os.path.join(outfolder, OUTPUT_FOLDER_NAME, "bedstat_output")
outfolder_stats = os.path.join(outfolder, OUTPUT_FOLDER_NAME, BEDSTAT_OUTPUT)
try:
os.makedirs(outfolder_stats)
except FileExistsError:
Expand All @@ -98,14 +134,16 @@ def bedstat(
bedfile_name = os.path.split(bedfile)[1]

fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0]
outfolder = os.path.abspath(os.path.join(outfolder_stats, bed_digest))
outfolder_stats_results = os.path.abspath(os.path.join(outfolder_stats, bed_digest))
try:
os.makedirs(outfolder)
os.makedirs(outfolder_stats_results)
except FileExistsError:
pass
json_file_path = os.path.abspath(os.path.join(outfolder, fileid + ".json"))
json_file_path = os.path.abspath(
os.path.join(outfolder_stats_results, fileid + ".json")
)
json_plots_file_path = os.path.abspath(
os.path.join(outfolder, fileid + "_plots.json")
os.path.join(outfolder_stats_results, fileid + "_plots.json")
)
bed_relpath = os.path.relpath(
bedfile,
Expand Down Expand Up @@ -145,7 +183,7 @@ def bedstat(
command = (
f"Rscript {rscript_path} --bedfilePath={bedfile} "
f"--fileId={fileid} --openSignalMatrix={open_signal_matrix} "
f"--outputFolder={outfolder} --genome={genome} "
f"--outputFolder={outfolder_stats_results} --genome={genome} "
f"--ensdb={ensdb} --digest={bed_digest}"
)

Expand Down Expand Up @@ -240,6 +278,10 @@ def bedstat(
values=data,
force_overwrite=force_overwrite,
)
if upload_s3:
load_to_s3(
os.path.abspath(outfolder), pm, bed_relpath, bed_digest, bigbed_relpath
)

if not skip_qdrant:
bbc.add_bed_to_qdrant(
Expand Down
1 change: 1 addition & 0 deletions bedboss/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
BED_FOLDER_NAME = "bed_files"
BIGBED_FOLDER_NAME = "bigbed_files"
OUTPUT_FOLDER_NAME = "output"
BEDSTAT_OUTPUT = "bedstat_output"
QC_FOLDER_NAME = "bed_qc"

# bedmaker
Expand Down

0 comments on commit 92eac34

Please sign in to comment.