Skip to content

Commit

Permalink
Merge pull request #44 from databio/restructuring
Browse files Browse the repository at this point in the history
Restructuring
  • Loading branch information
khoroshevskyi authored Feb 26, 2024
2 parents 3dbd1db + 96ca0a8 commit 884d99b
Show file tree
Hide file tree
Showing 11 changed files with 227 additions and 133 deletions.
101 changes: 63 additions & 38 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import peppy
from eido import validate_project
import bbconf
import subprocess

import pephubclient
from pephubclient import PEPHubClient
Expand All @@ -28,7 +29,7 @@
BEDSTAT_OUTPUT,
BED_PEP_REGISTRY,
)
from bedboss.models import BedMetadata
from bedboss.models import BedMetadata, BedStatCLIModel, BedMakerCLIModel, BedQCCLIModel
from bedboss.utils import (
extract_file_name,
standardize_genome_name,
Expand Down Expand Up @@ -113,6 +114,18 @@ def load_to_s3(
pm.run(cmd=command, lock_name="s3_sync_bedstat")


def requirements_check() -> None:
    """
    Check if all external (non-Python) requirements for bedboss are installed.

    Delegates to the bundled ``requirements_test.sh`` shell script, which
    prints its findings to stdout/stderr. A non-zero exit status from the
    script is logged as a warning rather than raised, so the CLI command
    remains informational and never aborts.

    :return: None
    """
    _LOGGER.info("Checking requirements...")
    # Resolve the script relative to this module, not the caller's
    # working directory, so the check works from anywhere.
    script_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "requirements_test.sh"
    )
    result = subprocess.run(["bash", script_path])
    # The original discarded the exit status entirely; surface failures
    # explicitly (without raising, to stay backward-compatible).
    if result.returncode != 0:
        _LOGGER.warning(
            f"Requirements check script exited with code {result.returncode}"
        )


def run_all(
sample_name: str,
input_file: str,
Expand Down Expand Up @@ -301,7 +314,6 @@ def insert_pep(
pep: Union[str, peppy.Project],
rfg_config: str = None,
create_bedset: bool = True,
skip_qdrant: bool = True,
check_qc: bool = True,
standardize: bool = False,
ensdb: str = None,
Expand All @@ -310,6 +322,7 @@ def insert_pep(
force_overwrite: bool = False,
upload_s3: bool = False,
upload_pephub: bool = False,
upload_qdrant: bool = False,
pm: pypiper.PipelineManager = None,
*args,
**kwargs,
Expand All @@ -323,19 +336,22 @@ def insert_pep(
:param Union[str, peppy.Project] pep: path to the pep file or pephub registry path
:param str rfg_config: path to the genome config file (refgenie)
:param bool create_bedset: whether to create bedset
:param bool skip_qdrant: whether to skip qdrant indexing
:param bool upload_qdrant: whether to upload bedfiles to qdrant
:param bool check_qc: whether to run quality control during badmaking
:param bool standardize: "Standardize bed files: remove non-standard chromosomes and headers if necessary Default: False"
:param str ensdb: a full path to the ensdb gtf file required for genomes not in GDdata
:param bool just_db_commit: whether just to commit the JSON to the database
:param bool no_db_commit: whether the JSON commit to the database should be skipped
:param bool just_db_commit: whether save only to the database (Without saving locally )
:param bool db_commit: whether to upload data to the database
:param bool force_overwrite: whether to overwrite the existing record
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param bool upload_qdrant: whether to execute qdrant indexing
:param pypiper.PipelineManager pm: pypiper object
:return: None
"""

_LOGGER.warning(f"!Unused arguments: {kwargs}")
failed_samples = []
pephub_registry_path = None
if isinstance(pep, peppy.Project):
pass
Expand All @@ -354,36 +370,41 @@ def insert_pep(

for i, pep_sample in enumerate(pep.samples):
_LOGGER.info(f"Running bedboss pipeline for {pep_sample.sample_name}")

if pep_sample.get("file_type").lower() == "narrowpeak":
is_narrow_peak = True
if pep_sample.get("file_type"):
if pep_sample.get("file_type").lower() == "narrowpeak":
is_narrow_peak = True
else:
is_narrow_peak = False
else:
is_narrow_peak = False

bed_id = run_all(
sample_name=pep_sample.sample_name,
input_file=pep_sample.input_file,
input_type=pep_sample.input_type,
genome=pep_sample.genome,
narrowpeak=is_narrow_peak,
chrom_sizes=pep_sample.get("chrom_sizes"),
open_signal_matrix=pep_sample.get("open_signal_matrix"),
other_metadata=pep_sample.to_dict(),
outfolder=output_folder,
bedbase_config=bbc,
rfg_config=rfg_config,
check_qc=check_qc,
standardize=standardize,
ensdb=ensdb,
just_db_commit=just_db_commit,
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
skip_qdrant=skip_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
pep.samples[i].record_identifier = bed_id
try:
bed_id = run_all(
sample_name=pep_sample.sample_name,
input_file=pep_sample.input_file,
input_type=pep_sample.input_type,
genome=pep_sample.genome,
narrowpeak=is_narrow_peak,
chrom_sizes=pep_sample.get("chrom_sizes"),
open_signal_matrix=pep_sample.get("open_signal_matrix"),
other_metadata=pep_sample.to_dict(),
outfolder=output_folder,
bedbase_config=bbc,
rfg_config=rfg_config,
check_qc=check_qc,
standardize=standardize,
ensdb=ensdb,
just_db_commit=just_db_commit,
no_db_commit=no_db_commit,
force_overwrite=force_overwrite,
upload_qdrant=upload_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
pm=pm,
)
pep.samples[i].record_identifier = bed_id
except BedBossException as e:
_LOGGER.error(f"Failed to process {pep_sample.sample_name}. See {e}")
failed_samples.append(pep_sample.sample_name)

else:
_LOGGER.info("Skipping uploading to s3. Flag `upload_s3` is set to False")
Expand All @@ -394,11 +415,13 @@ def insert_pep(
bedbase_config=bbc,
bedset_pep=pep,
pephub_registry_path=pephub_registry_path,
upload_pephub=upload_pephub,
)
else:
_LOGGER.info(
f"Skipping bedset creation. Create_bedset is set to {create_bedset}"
)
_LOGGER.info(f"Failed samples: {failed_samples}")


def main(test_args: dict = None) -> NoReturn:
Expand All @@ -423,28 +446,30 @@ def main(test_args: dict = None) -> NoReturn:
or "test_outfolder",
)
pm_out_folder = os.path.join(os.path.abspath(pm_out_folder[0]), "pipeline_manager")

pm = pypiper.PipelineManager(
name="bedboss-pipeline",
outfolder=pm_out_folder,
version=__version__,
args=args,
# args=args,
multi=args_dict.get("multy", False),
recover=True,
)
if args_dict["command"] == "all":
run_all(pm=pm, **args_dict)
elif args_dict["command"] == "insert":
insert_pep(pm=pm, **args_dict)
elif args_dict["command"] == "make":
make_all(pm=pm, **args_dict)
make_all(**BedMakerCLIModel(pm=pm, **args_dict).model_dump())
elif args_dict["command"] == "qc":
bedqc(pm=pm, **args_dict)
bedqc(**BedQCCLIModel(pm=pm, **args_dict).model_dump())
elif args_dict["command"] == "stat":
bedstat(pm=pm, **args_dict)
bedstat(**BedStatCLIModel(pm=pm, **args_dict).model_dump())
elif args_dict["command"] == "bunch":
run_bedbuncher(pm=pm, **args_dict)
elif args_dict["command"] == "index":
add_to_qdrant(pm=pm, **args_dict)
elif args_dict["command"] == "requirements-check":
requirements_check()
else:
parser.print_help()
# raise Exception("Incorrect pipeline name.")
Expand Down
69 changes: 47 additions & 22 deletions bedboss/bedbuncher/bedbuncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
import pephubclient
from pephubclient.helpers import is_registry_path
import logging
from ubiquerg import parse_registry_path

from bedboss.const import (
DEFAULT_BEDBASE_API_URL,
DEFAULT_BEDBASE_CACHE_PATH,
OUTPUT_FOLDER_NAME,
BED_PEP_REGISTRY,
)


Expand All @@ -37,11 +38,14 @@ def create_bedset_from_pep(
_LOGGER.info("Creating bedset from pep.")
new_bedset = BedSet()
for bedfile_id in pep.samples:
bedfile_object = BBClient(
cache_folder=cache_folder,
bedbase_api=bedbase_api,
).load_bed(bedfile_id.get("record_identifier") or bedfile_id.sample_name)
new_bedset.add(bedfile_object)
try:
bedfile_object = BBClient(
cache_folder=cache_folder,
bedbase_api=bedbase_api,
).load_bed(bedfile_id.get("record_identifier") or bedfile_id.sample_name)
new_bedset.add(bedfile_object)
except Exception as err:
pass
_LOGGER.info("Bedset was created successfully")
return new_bedset

Expand Down Expand Up @@ -231,6 +235,7 @@ def run_bedbuncher(
bedbase_api: str = DEFAULT_BEDBASE_API_URL,
cache_path: str = DEFAULT_BEDBASE_CACHE_PATH,
heavy: bool = False,
upload_pephub: bool = False,
*args,
**kwargs,
) -> None:
Expand All @@ -244,6 +249,7 @@ def run_bedbuncher(
:param cache_path: path to the cache folder [DEFAULT: ./bedbase_cache]
:param heavy: whether to use heavy processing (add all columns to the database).
if False -> R-script won't be executed, only basic statistics will be calculated
:param upload_pephub: whether to upload bedset to pephub
:return: None
"""

Expand Down Expand Up @@ -278,22 +284,41 @@ def run_bedbuncher(
_LOGGER.warning(
f"Description for bedset {bedset_name or pep_of_bed.get('name')} was not provided."
)

add_bedset_to_database(
bbc,
record_id=bedset_name or pep_of_bed.name,
bed_set=bedset,
bedset_name=bedset_name or pep_of_bed.name,
genome=dict(pep_of_bed.config.get("genome", {})),
description=pep_of_bed.description or "",
pephub_registry_path=pephub_registry_path,
heavy=heavy,
)
record_id = bedset_name or pep_of_bed.name
try:
add_bedset_to_database(
bbc,
record_id=record_id,
bed_set=bedset,
bedset_name=bedset_name or pep_of_bed.name,
genome=dict(pep_of_bed.config.get("genome", {})),
description=pep_of_bed.description or "",
# pephub_registry_path=pephub_registry_path,
heavy=heavy,
)
except Exception as err:
pass
if upload_pephub:
phc = pephubclient.PEPHubClient()
reg_path_obj = parse_registry_path(BED_PEP_REGISTRY)
bed_ids = [
sample.record_identifier
for sample in pep_of_bed.samples
if sample.get("record_identifier") is not None
]
print(bed_ids)
phc.view.create(
namespace=reg_path_obj["namespace"],
name=reg_path_obj["item"],
tag=reg_path_obj["tag"],
view_name=record_id,
sample_list=bed_ids,
)
return None


if __name__ == "__main__":
run_bedbuncher(
"/media/alex/Extreme SSD/databio/repos/bedbase_all/bedhost/bedbase_configuration_compose.yaml",
"databio/excluderanges:id3",
)
# if __name__ == "__main__":
# run_bedbuncher(
# "/media/alex/Extreme SSD/databio/repos/bedbase_all/bedhost/bedbase_configuration_compose.yaml",
# "databio/excluderanges:id3",
# )
18 changes: 11 additions & 7 deletions bedboss/bedmaker/bedmaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def __init__(
standardize: bool = False,
check_qc: bool = True,
pm: pypiper.PipelineManager = None,
**kwargs,
):
"""
Pypiper pipeline to convert supported file formats into
Expand Down Expand Up @@ -186,11 +185,16 @@ def make(self) -> dict:
# we need this exception to catch the case when the input file is not a bed file
bed_type, bed_format = get_bed_type(self.output_bed)
if self.check_qc:
bedqc(
self.output_bed,
outfolder=os.path.join(self.bed_parent, QC_FOLDER_NAME),
pm=self.pm,
)
try:
bedqc(
self.output_bed,
outfolder=os.path.join(self.bed_parent, QC_FOLDER_NAME),
pm=self.pm,
)
except Exception as e:
raise BedBossException(
f"Quality control failed for {self.output_bed}. Error: {e}"
)

self.make_bigbed(bed_type=bed_type)

Expand Down Expand Up @@ -356,7 +360,7 @@ def copy_with_standardization(self):
except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
if row_count <= max_rows:
row_count += 1
if not df:
if not isinstance(df, pd.DataFrame):
raise BedBossException(
reason=f"Bed file is broken and could not be parsed due to CSV parse error."
)
Expand Down
3 changes: 1 addition & 2 deletions bedboss/bedqc/bedqc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def bedqc(
max_region_number: int = MAX_REGION_NUMBER,
min_region_width: int = MIN_REGION_WIDTH,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> bool:
"""
Perform quality checks on a BED file.
Expand All @@ -30,9 +29,9 @@ def bedqc(
:param min_region_width: Minimum region width threshold to pass the quality check.
:param pm: Pypiper object for managing pipeline operations.
:return: True if the file passes the quality check.
:raises QualityException: if the file does not pass the quality
"""
_LOGGER.info("Running bedqc...")
_LOGGER.warning(f"Unused arguments: {kwargs}")

output_file = os.path.join(outfolder, "failed_qc.csv")
bedfile_name = os.path.basename(bedfile)
Expand Down
1 change: 0 additions & 1 deletion bedboss/bedstat/bedstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def bedstat(
open_signal_matrix: str = None,
just_db_commit: bool = False,
pm: pypiper.PipelineManager = None,
**kwargs,
) -> dict:
"""
Run bedstat pipeline - pipeline for obtaining statistics about bed files
Expand Down
Loading

0 comments on commit 884d99b

Please sign in to comment.