Skip to content

Commit

Permalink
Merge pull request #40 from databio/restructuring
Browse files Browse the repository at this point in the history
Restructuring
  • Loading branch information
khoroshevskyi authored Feb 21, 2024
2 parents 57bc897 + ffa08e9 commit 3dbd1db
Show file tree
Hide file tree
Showing 10 changed files with 255 additions and 335 deletions.
7 changes: 7 additions & 0 deletions bedboss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,10 @@
datefmt="%H:%M:%S",
fmt="[%(levelname)s] [%(asctime)s] [BBCONF] %(message)s",
)

# Attach colored log formatting to the pephubclient logger.
# NOTE: this variable was previously named _LOGGER_BBCONF (copy-paste from the
# bbconf logger block above), which silently rebound that name; a dedicated
# name avoids the shadowing.
_LOGGER_PEPHUBCLIENT = logging.getLogger("pephubclient")
coloredlogs.install(
    logger=_LOGGER_PEPHUBCLIENT,
    datefmt="%H:%M:%S",
    fmt="[%(levelname)s] [%(asctime)s] [PEPHUBCLIENT] %(message)s",
)
165 changes: 149 additions & 16 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@
import logmuse
import peppy
from eido import validate_project
import bbconf

import pephubclient
from pephubclient import PEPHubClient
from pephubclient.helpers import is_registry_path
import bbconf
from ubiquerg import parse_registry_path

from bedboss.bedstat.bedstat import bedstat
from bedboss.bedmaker.bedmaker import make_all
Expand All @@ -21,7 +24,11 @@
BED_FOLDER_NAME,
BIGBED_FOLDER_NAME,
BEDBOSS_PEP_SCHEMA_PATH,
OUTPUT_FOLDER_NAME,
BEDSTAT_OUTPUT,
BED_PEP_REGISTRY,
)
from bedboss.models import BedMetadata
from bedboss.utils import (
extract_file_name,
standardize_genome_name,
Expand All @@ -33,6 +40,79 @@
_LOGGER = logging.getLogger("bedboss")


def load_to_pephub(
    pep_registry_path: str, bed_digest: str, genome: str, metadata: dict
) -> None:
    """
    Upload a bed file's metadata as a sample to a PEP on PEPhub.

    :param pep_registry_path: registry path to the PEP on PEPhub
        (e.g. "namespace/name:tag")
    :param bed_digest: unique bed file identifier; used as the sample name
    :param genome: genome associated with the bed file
    :param metadata: any other metadata that has been collected; validated and
        normalized through the BedMetadata model before upload
    :return: None
    """
    # Guard clause: bail out early on a malformed registry path.
    if not is_registry_path(pep_registry_path):
        _LOGGER.error(f"{pep_registry_path} is not a valid registry path")
        return

    parsed_pep_dict = parse_registry_path(pep_registry_path)

    # Combine data into a dict for sending to pephub. Metadata is merged after
    # the base fields, so keys present in `metadata` take precedence.
    sample_data = {"sample_name": bed_digest, "genome": genome}
    # Round-trip through BedMetadata so only model-defined keys are sent.
    sample_data.update(BedMetadata(**metadata).model_dump())

    try:
        PEPHubClient().sample.create(
            namespace=parsed_pep_dict["namespace"],
            name=parsed_pep_dict["item"],
            tag=parsed_pep_dict["tag"],
            sample_name=bed_digest,
            overwrite=True,
            sample_dict=sample_data,
        )
    except Exception as e:  # TODO: catch the specific pephubclient exception
        _LOGGER.error(f"Failed to upload BEDFILE to PEPhub: See {e}")


def load_to_s3(
    output_folder: str,
    pm: pypiper.PipelineManager,
    bed_file: str,
    digest: str,
    bigbed_file: str = None,
) -> None:
    """
    Upload the bed file, optional bigbed file, and statistics output to s3.

    Runs the aws cli through the pipeline manager, so each upload is logged
    and lock-protected by pypiper.

    :param output_folder: base output folder
    :param pm: PipelineManager object used to execute the aws cli commands
    :param bigbed_file: bigbed file name (relative to output_folder); the
        bigbed upload is skipped when this is falsy
    :param bed_file: bed file name (relative to output_folder)
    :param digest: bed file digest, which names the statistics sub-folder
    :return: None
    """
    command = f"aws s3 cp {os.path.join(output_folder, bed_file)} s3://bedbase/{BED_FOLDER_NAME}"
    _LOGGER.info("Uploading to s3 bed file")
    pm.run(cmd=command, lock_name="s3_sync_bed")
    if bigbed_file:
        command = f"aws s3 cp {os.path.join(output_folder, bigbed_file)} s3://bedbase/{BIGBED_FOLDER_NAME}"
        _LOGGER.info("Uploading to s3 bigbed file")
        pm.run(cmd=command, lock_name="s3_sync_bigbed")
    # --size-only: sync re-uploads only statistics files whose size changed.
    stats_dir = os.path.join(output_folder, OUTPUT_FOLDER_NAME, BEDSTAT_OUTPUT, digest)
    command = f"aws s3 sync {stats_dir} s3://bedbase/{OUTPUT_FOLDER_NAME}/{BEDSTAT_OUTPUT}/{digest} --size-only"
    _LOGGER.info("Uploading to s3 bed statistic files")
    pm.run(cmd=command, lock_name="s3_sync_bedstat")


def run_all(
sample_name: str,
input_file: str,
Expand All @@ -49,9 +129,9 @@ def run_all(
ensdb: str = None,
other_metadata: dict = None,
just_db_commit: bool = False,
no_db_commit: bool = False,
db_commit: bool = True,
force_overwrite: bool = False,
skip_qdrant: bool = True,
upload_qdrant: bool = False,
upload_s3: bool = False,
upload_pephub: bool = False,
pm: pypiper.PipelineManager = None,
Expand Down Expand Up @@ -79,8 +159,9 @@ def run_all(
(basically genomes that's not in GDdata)
:param bool just_db_commit: whether just to commit the JSON to the database (default: False)
:param bool force_overwrite: force overwrite analysis
:param bool no_db_commit: whether the JSON commit to the database should be skipped (default: False)
:param bool skip_qdrant: whether to skip qdrant indexing
:param bool db_commit: whether the JSON commit to the database should be skipped (default: False)
:param bool upload_qdrant: whether to skip qdrant indexing
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param pypiper.PipelineManager pm: pypiper object
Expand All @@ -91,6 +172,9 @@ def run_all(
if isinstance(bedbase_config, str):
if not check_db_connection(bedbase_config=bedbase_config):
raise BedBossException("Database connection failed. Exiting...")
bbc = bbconf.BedBaseConf(config_path=bedbase_config, database_only=True)
else:
bbc = bedbase_config

file_name = extract_file_name(input_file)
genome = standardize_genome_name(genome)
Expand Down Expand Up @@ -135,27 +219,76 @@ def run_all(
pm=pm,
)
if not other_metadata:
other_metadata = classification_meta
else:
other_metadata.update(classification_meta)
other_metadata = {}

bed_digest = classification_meta.get("digest")

bed_digest = bedstat(
statistics_dict = bedstat(
bedfile=output_bed,
outfolder=outfolder,
bedbase_config=bedbase_config,
genome=genome,
ensdb=ensdb,
bed_digest=bed_digest,
open_signal_matrix=open_signal_matrix,
bigbed=output_bigbed,
other_metadata=other_metadata,
just_db_commit=just_db_commit,
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
statistics_dict.update(
{
"bed_type": classification_meta["bed_type"],
"bed_format": classification_meta["bed_format"],
}
)

if db_commit:
bbc.bed.report(
record_identifier=bed_digest,
values=statistics_dict,
force_overwrite=force_overwrite,
)

if upload_s3:
_LOGGER.info(f"Uploading '{bed_digest}' data to S3 ...")
load_to_s3(
os.path.abspath(outfolder), pm, output_bed, bed_digest, output_bigbed
)
else:
_LOGGER.info(
f"Skipping uploading '{bed_digest}' data to S3. 'upload_s3' is set to False. "
)

if upload_qdrant:
_LOGGER.info(f"Adding '{bed_digest}' vector to Qdrant ...")

bbc.add_bed_to_qdrant(
bed_id=bed_digest,
bed_file=output_bed,
payload={"digest": bed_digest},
)
bbc.bed.report(
record_identifier=bed_digest,
values={"added_to_qdrant": True},
force_overwrite=True,
)
else:
_LOGGER.info(
f"Skipping adding '{bed_digest}' vector to Qdrant, 'skip_qdrant' is set to True. "
)

if upload_pephub:
_LOGGER.info(f"Uploading metadata of '{bed_digest}' TO PEPhub ...")
load_to_pephub(
pep_registry_path=BED_PEP_REGISTRY,
bed_digest=bed_digest,
genome=genome,
metadata=other_metadata,
)
else:
_LOGGER.info(
f"Metadata of '{bed_digest}' is NOT uploaded to PEPhub. 'upload_pephub' is set to False. "
)

if stop_pipeline:
pm.stop_pipeline()

Expand Down
16 changes: 12 additions & 4 deletions bedboss/bedmaker/bedmaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from refgenconf.exceptions import MissingGenomeError
from yacman.exceptions import UndefinedAliasError
from ubiquerg import is_command_callable
from geniml.io import RegionSet

from bedboss.bedclassifier.bedclassifier import get_bed_type
from bedboss.bedqc.bedqc import bedqc
Expand Down Expand Up @@ -180,10 +181,10 @@ def make(self) -> dict:
# converting to bed.gz if needed
self.make_bed()
try:
bed_type, format_type = get_bed_type(self.input_file)
bed_type, bed_format = get_bed_type(self.input_file)
except Exception:
# we need this exception to catch the case when the input file is not a bed file
bed_type, format_type = get_bed_type(self.output_bed)
bed_type, bed_format = get_bed_type(self.output_bed)
if self.check_qc:
bedqc(
self.output_bed,
Expand All @@ -195,8 +196,9 @@ def make(self) -> dict:

return {
"bed_type": bed_type,
"file_type": format_type,
"bed_format": bed_format,
"genome": self.genome,
"digest": RegionSet(self.output_bed).identifier,
}

def make_bed(self) -> None:
Expand Down Expand Up @@ -549,7 +551,13 @@ def make_all(
ChrUn chromosomes
:param check_qc: run quality control during bedmaking
:param pm: pypiper object
:return: dict with bed classificator results
:return: dict with generated bed metadata:
{
"bed_type": bed_type. e.g. bed, bigbed
"bed_format": bed_format. e.g. narrowpeak, broadpeak
"genome": genome of the sample,
"digest": bedfile identifier,
}
"""
return BedMaker(
input_file=input_file,
Expand Down
Loading

0 comments on commit 3dbd1db

Please sign in to comment.