Skip to content

Commit

Permalink
Merge pull request #33 from databio/dev_phc_bed_Metadata
Browse files Browse the repository at this point in the history
Allow uploading meta data from input pep to an output pep via registry path
  • Loading branch information
khoroshevskyi authored Feb 12, 2024
2 parents 5054da2 + 9bcaee4 commit 52aba4d
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 0 deletions.
10 changes: 10 additions & 0 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def run_all(
open_signal_matrix: str = None,
ensdb: str = None,
treatment: str = None,
pep_sample_dict: dict = None,
description: str = None,
cell_type: str = None,
other_metadata: dict = None,
Expand All @@ -95,6 +96,7 @@ def run_all(
force_overwrite: bool = False,
skip_qdrant: bool = True,
upload_s3: bool = False,
upload_pephub: bool = False,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> str:
Expand All @@ -116,6 +118,7 @@ def run_all(
:param str description: a description of the bed file
:param str open_signal_matrix: a full path to the openSignalMatrix required for the tissue [optional]
:param str treatment: a treatment of the bed file
:param dict pep_sample_dict: a dict containing all attributes from the sample
:param str cell_type: a cell type of the bed file
:param dict other_metadata: a dictionary of other metadata to pass
:param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata [optional]
Expand All @@ -125,6 +128,7 @@ def run_all(
:param bool no_db_commit: whether the JSON commit to the database should be skipped (default: False)
:param bool skip_qdrant: whether to skip qdrant indexing
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param pypiper.PipelineManager pm: pypiper object
:return str bed_digest: bed digest
"""
Expand Down Expand Up @@ -191,13 +195,15 @@ def run_all(
bigbed=output_bigbed,
description=description,
treatment=treatment,
pep_sample_dict=pep_sample_dict,
cell_type=cell_type,
other_metadata=other_metadata,
just_db_commit=just_db_commit,
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
return bed_digest
Expand All @@ -217,6 +223,7 @@ def insert_pep(
no_db_commit: bool = False,
force_overwrite: bool = False,
upload_s3: bool = False,
upload_pephub: bool = False,
pm: pypiper.PipelineManager = None,
*args,
**kwargs,
Expand All @@ -238,6 +245,7 @@ def insert_pep(
:param bool no_db_commit: whether the JSON commit to the database should be skipped
:param bool force_overwrite: whether to overwrite the existing record
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param pypiper.PipelineManager pm: pypiper object
:return: None
"""
Expand Down Expand Up @@ -277,6 +285,7 @@ def insert_pep(
description=pep_sample.get("description"),
cell_type=pep_sample.get("cell_type"),
treatment=pep_sample.get("treatment"),
pep_sample_dict=pep_sample.to_dict(),
outfolder=output_folder,
bedbase_config=bbc,
rfg_config=rfg_config,
Expand All @@ -288,6 +297,7 @@ def insert_pep(
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
pep.samples[i].record_identifier = bed_id
Expand Down
67 changes: 67 additions & 0 deletions bedboss/bedstat/bedstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
import pypiper
import bbconf
import logging
import pephubclient as phc
from geniml.io import RegionSet
from pephubclient import PEPHubClient
from pephubclient.helpers import is_registry_path
from ubiquerg import parse_registry_path

from bedboss.const import (
OUTPUT_FOLDER_NAME,
Expand All @@ -21,6 +25,8 @@
os.path.dirname(os.path.realpath(__file__)), "pep_schema.yaml"
)

BED_PEP_REGISTRY = "databio/allbeds:bedbase"


def convert_unit(size_in_bytes: int) -> str:
"""
Expand All @@ -38,6 +44,48 @@ def convert_unit(size_in_bytes: int) -> str:
return str(round(size_in_bytes / (1024 * 1024 * 1024))) + "GB"


def load_to_pephub(
    pep_registry_path: str, bed_digest: str, genome: str, metadata: dict
) -> None:
    """
    Load bedfile and metadata to PEPHUB

    :param str pep_registry_path: registry path to pep on pephub
        (expected form: "namespace/name:tag")
    :param str bed_digest: unique bedfile identifier; used as the sample_name
    :param str genome: genome associated with bedfile
    :param dict metadata: Any other metadata that has been collected
    :return None
    """
    # Guard clause: bail out early on malformed registry paths.
    if not is_registry_path(pep_registry_path):
        _LOGGER.warning(f"{pep_registry_path} is not a valid registry path")
        return

    parsed_pep_dict = parse_registry_path(pep_registry_path)

    # Combine data into a dict for sending to pephub
    sample_data = {"sample_name": bed_digest, "genome": genome}

    for key, value in metadata.items():
        # TODO Confirm this key is in the schema
        # Then update sample_data
        sample_data.update({key: value})

    try:
        PEPHubClient().sample.create(
            namespace=parsed_pep_dict["namespace"],
            name=parsed_pep_dict["item"],
            # BUGFIX: previously reused the "item" field here; the tag is a
            # separate component of the registry path (parse_registry_path
            # returns it under the "tag" key — TODO confirm against ubiquerg).
            tag=parsed_pep_dict["tag"],
            sample_name=bed_digest,
            overwrite=True,
            sample_dict=sample_data,
        )

    except Exception as e:  # Need more specific exception
        # BUGFIX: message previously said "Bedbase", but this call uploads
        # to PEPhub; keep the log accurate so failures are triaged correctly.
        _LOGGER.warning(f"Failed to upload BEDFILE to PEPhub: See {e}")


def load_to_s3(
output_folder: str,
pm: pypiper.PipelineManager,
Expand Down Expand Up @@ -76,6 +124,7 @@ def bedstat(
open_signal_matrix: str = None,
bigbed: str = None,
treatment: str = None,
pep_sample_dict: dict = None,
description: str = None,
cell_type: str = None,
other_metadata: dict = None,
Expand All @@ -84,6 +133,7 @@ def bedstat(
force_overwrite: bool = False,
skip_qdrant: bool = True,
upload_s3: bool = False,
upload_pephub: bool = False,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> str:
Expand All @@ -104,6 +154,7 @@ def bedstat(
not in GDdata
:param str description: a description of the bed file
:param str treatment: a treatment of the bed file
:param dict pep_sample_dict: a dict containing all attributes from the sample
:param str cell_type: a cell type of the bed file
:param dict other_metadata: a dictionary of other metadata to pass
:param bool just_db_commit: whether just to commit the JSON to the database
Expand All @@ -112,12 +163,14 @@ def bedstat(
:param skip_qdrant: whether to skip qdrant indexing [Default: True]
:param bool force_overwrite: whether to overwrite the existing record
:param upload_s3: whether to upload the bed file to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param pm: pypiper object
:return: bed_digest: the digest of the bed file
"""
# TODO why are we no longer using bbconf to get the output path?
# outfolder_stats = bbc.get_bedstat_output_path()

outfolder_stats = os.path.join(outfolder, OUTPUT_FOLDER_NAME, BEDSTAT_OUTPUT)
try:
os.makedirs(outfolder_stats)
Expand Down Expand Up @@ -211,6 +264,11 @@ def bedstat(
}
)

# For now, add all the *other* attributes to other_metadata
for key, value in pep_sample_dict.items():
if key not in list(other_metadata.keys()):
other_metadata.update({key: value})

# unlist the data, since the output of regionstat.R is a dict of lists of
# length 1 and force keys to lower to correspond with the
# postgres column identifiers
Expand Down Expand Up @@ -295,6 +353,15 @@ def bedstat(
force_overwrite=True,
)

if upload_pephub:
_LOGGER.info("UPLOADING TO PEPHUB...")
load_to_pephub(
pep_registry_path=BED_PEP_REGISTRY,
bed_digest=bed_digest,
genome=genome,
metadata=other_metadata,
)

if stop_pipeline:
pm.stop_pipeline()
return bed_digest
10 changes: 10 additions & 0 deletions bedboss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,11 @@ def build_argparser() -> ArgumentParser:
action="store_true",
help="whether to skip qdrant indexing",
)
sub_all.add_argument(
"--upload-pephub",
action="store_true",
help="upload to pephub",
)

# all-pep
sub_all_pep.add_argument(
Expand Down Expand Up @@ -245,6 +250,11 @@ def build_argparser() -> ArgumentParser:
"Before uploading you have to set up all necessury env vars: "
"AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_ENDPOINT_URL. [Default: False]",
)
sub_all_pep.add_argument(
"--upload-pephub",
action="store_true",
help="upload to pephub",
)

# bed_qc
sub_qc.add_argument(
Expand Down

0 comments on commit 52aba4d

Please sign in to comment.