Commit

Fixed #20

khoroshevskyi committed Nov 1, 2023
1 parent a00c5b7 commit b086146
Showing 19 changed files with 536 additions and 28 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -8,6 +8,7 @@ __pycache__/

# Distribution / packaging
.Python
.ruff_cache/
build/
develop-eggs/
dist/
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,10 @@
repos:
# Run the Ruff linter.
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.1.3
hooks:
# Run the Ruff linter.
- id: ruff
# Run the Ruff formatter.
- id: ruff-format
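For contributors, these hooks follow standard pre-commit usage: running "pre-commit install" once registers them as a git hook, and "pre-commit run --all-files" applies the Ruff linter and formatter to the whole tree.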
4 changes: 3 additions & 1 deletion MANIFEST.in
@@ -5,4 +5,6 @@ include bedboss/bedstat/*
include bedboss/bedstat/tools/*
include bedboss/bedmaker/*
include bedboss/bedqc/*
include bedboss/qdrant_index/*
include bedboss/qdrant_index/*
include bedboss/bedbuncher/*
include bedboss/bedbuncher/tools/*
2 changes: 1 addition & 1 deletion README.md
@@ -30,4 +30,4 @@ Calculates statistics about BED files.

Detailed information about each pipeline can be found in the [bedboss Readme](./docs/README.md).

For the specific bedbase.org instance, see instructions in the bedbase.org repo.
For the specific bedbase.org instance, see instructions in the bedbase.org repo.
12 changes: 10 additions & 2 deletions bedboss/__init__.py
@@ -2,13 +2,20 @@
import logmuse
import coloredlogs

from bedboss import *
# from bedboss import *

# from bedboss.bedqc.bedqc import bedqc
# from bedboss.bedmaker.bedmaker import BedMaker
# from bedboss.bedstat.bedstat import bedstat
from bedboss._version import __version__
from bedboss.bedboss import run_all, run_all_by_pep, bedqc, BedMaker, bedstat
from bedboss.bedboss import (
run_all,
run_all_by_pep,
bedqc,
BedMaker,
bedstat,
run_bedbuncher,
)


__package_name__ = "bedboss"
@@ -33,6 +40,7 @@
"bedstat",
"run_all",
"run_all_by_pep",
"run_bedbuncher",
]

_LOGGER = logmuse.init_logger("bedboss")
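With the updated exports above, the bundler entry point is importable at the package level; a minimal import sketch:

from bedboss import run_bedbuncher
# equivalent to importing it from the new subpackage directly:
# from bedboss.bedbuncher import run_bedbuncher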
2 changes: 1 addition & 1 deletion bedboss/_version.py
@@ -1 +1 @@
__version__ = "0.1.0a4"
__version__ = "0.1.0a5"
8 changes: 5 additions & 3 deletions bedboss/bedboss.py
@@ -1,8 +1,7 @@
import logging
import os
from typing import NoReturn, Union, Dict
from typing import NoReturn, Union

import peppy
import pypiper
from argparse import Namespace
import logmuse
@@ -11,6 +10,7 @@
from bedboss.bedstat.bedstat import bedstat
from bedboss.bedmaker.bedmaker import BedMaker
from bedboss.bedqc.bedqc import bedqc
from bedboss.bedbuncher import run_bedbuncher
from bedboss.qdrant_index import add_to_qdrant
from bedboss.cli import build_argparser
from bedboss.const import (
@@ -42,7 +42,7 @@ def get_osm_path(genome: str) -> Union[str, None]:
:return: path to the Open Signal Matrix
"""
# TODO: add more osm
_LOGGER.info(f"Getting Open Signal Matrix file path...")
_LOGGER.info("Getting Open Signal Matrix file path...")
if genome == "hg19" or genome == "GRCh37":
osm_name = OS_HG19
elif genome == "hg38" or genome == "GRCh38":
@@ -243,6 +243,8 @@ def main(test_args: dict = None) -> NoReturn:
bedqc(pm=pm, **args_dict)
elif args_dict["command"] == "stat":
bedstat(pm=pm, **args_dict)
elif args_dict["command"] == "bunch":
run_bedbuncher(pm=pm, **args_dict)
elif args_dict["command"] == "index":
add_to_qdrant(pm=pm, **args_dict)
else:
3 changes: 3 additions & 0 deletions bedboss/bedbuncher/__init__.py
@@ -0,0 +1,3 @@
from bedboss.bedbuncher.bedbuncher import run_bedbuncher

__all__ = ["run_bedbuncher"]
267 changes: 267 additions & 0 deletions bedboss/bedbuncher/bedbuncher.py
@@ -0,0 +1,267 @@
from geniml.io import BedSet
from bbconf import BedBaseConf
from bbconf.const import CFG_PATH_KEY, CFG_PATH_BEDBUNCHER_DIR_KEY
from geniml.bbclient import BBClient
from sqlmodel import select, func, Numeric, Float
import os
import json
import subprocess
import peppy
import pephubclient
from pephubclient.helpers import is_registry_path
import logging

from bedboss.const import DEFAULT_BEDBASE_API_URL, DEFAULT_BEDBASE_CACHE_PATH


_LOGGER = logging.getLogger("bedboss")


def create_bedset_from_pep(
pep: peppy.Project, bedbase_api: str, cache_folder: str = DEFAULT_BEDBASE_CACHE_PATH
) -> BedSet:
"""
Create a BedSet from a PEP, where each sample_name is a bed file identifier
:param pep: peppy Project whose samples name bed files by their BEDbase record identifiers
:param bedbase_api: URL of the BEDbase API used to fetch the bed files
:param cache_folder: path to the local cache folder used by BBClient
:return: BedSet containing the loaded bed files
"""
new_bedset = BedSet()
for bedfile_id in pep.samples:
bedfile_object = BBClient(
cache_folder=cache_folder,
bedbase_api=bedbase_api,
).load_bed(bedfile_id.sample_name)
new_bedset.add(bedfile_object)
return new_bedset


def calculate_bedset_statistics(bbc: BedBaseConf, bedset: BedSet) -> dict:
"""
Calculate mean and standard deviation for each numeric column of bedfiles in bedset
:param bbc: BedBase configuration object
:param bedset: Bedset object
:return: dict with mean and standard deviation for each numeric column:
{"sd": {"column_name": sd_value},
"mean": {"column_name": mean_value}}
"""

numeric_columns = [
column
for column, value in bbc.bed.result_schemas.items()
if value["type"] == "number"
]
list_of_samples = [sample.identifier for sample in bedset]

results_dict = {"mean": {}, "sd": {}}

for column_name in numeric_columns:
with bbc.bed.backend.session as s:
mean_bedset_statement = select(
func.round(
func.avg(getattr(bbc.BedfileORM, column_name)).cast(Numeric), 4
).cast(Float)
).where(bbc.BedfileORM.record_identifier.in_(list_of_samples))
sd_bedset_statement = select(
func.round(
func.stddev(getattr(bbc.BedfileORM, column_name)).cast(Numeric), 4
).cast(Float)
).where(bbc.BedfileORM.record_identifier.in_(list_of_samples))

results_dict["mean"][column_name] = s.exec(mean_bedset_statement).one()
results_dict["sd"][column_name] = s.exec(sd_bedset_statement).one()

return results_dict

# # Another way to do it, but it's slower:
# results_dict = {}
# results = bbc.bed.retrieve(record_identifier=list_of_samples, result_identifier=int_col)["records"]
# for sample in results:
# for stat_value_dict in sample.values():
# for key, value in stat_value_dict.items():
# if key in results_dict:
# results_dict[key].append(value)
# else:
# results_dict[key] = [value]


def create_bed_list_file(bedset: BedSet, file_path: str) -> None:
"""
Create a file listing the paths of the bed files in the bedset (this file is later used by the R script)
:param bedset: bed_set object
:param file_path: path to the file
:return: None
"""
list_of_samples = [sample.path for sample in bedset]

with open(file_path, "w") as f:
for sample in list_of_samples:
f.write(sample + "\n")

return None


def create_plots(
bbc: BedBaseConf,
bedset: BedSet,
bedset_name: str,
) -> dict:
"""
Create plots for a bedset (commonality region plot)
:param bbc: BedBaseConf object
:param bedset: Bedset object
:param bedset_name: bedset name
:return: dict with information about the created plot
"""
bedset_md5sum = bedset.bedset_identifier

output_folder = os.path.abspath(
bbc.config[CFG_PATH_KEY][CFG_PATH_BEDBUNCHER_DIR_KEY]
)
# if output folder doesn't exist create it
if not os.path.exists(output_folder):
os.makedirs(output_folder)
bedset_list_path = os.path.join(output_folder, f"{bedset_md5sum}_bedset.txt")
create_bed_list_file(bedset, bedset_list_path)
rscript_path = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
"bedbuncher",
"tools",
"bedsetStat.R",
)
assert os.path.exists(rscript_path), FileNotFoundError(
f"'{rscript_path}' script not found"
)

json_file_path = os.path.join(output_folder, bedset_md5sum + ".json")
command = (
f"Rscript {rscript_path} --outputfolder={output_folder} "
f"--bedfilelist={bedset_list_path} --id={bedset_md5sum} "
f"--json={json_file_path}"
)

subprocess.run(command, shell=True)

with open(json_file_path, "r", encoding="utf-8") as f:
bedset_summary_info = json.loads(f.read())

os.remove(bedset_list_path)
os.remove(json_file_path)
return bedset_summary_info["plots"][0]


def add_bedset_to_database(
bbc: BedBaseConf,
record_id: str,
bed_set: BedSet,
bedset_name: str,
genome: dict = None,
description: str = None,
heavy: bool = False,
) -> None:
"""
Add bedset to the database
:param bbc: BedBaseConf object
:param record_id: record identifier to be used in database
:param bed_set: Bedset object
:param bedset_name: Bedset name
:param genome: genome of the bedset
:param description: Bedset description
:param heavy: whether to use heavy processing (add all columns to the database).
if False -> R-script won't be executed, only basic statistics will be calculated
:return:
"""
if not bedset_name:
raise ValueError(
"bedset_name was not provided correctly. Please provide it in pep name or as argument"
)

bed_set_stats = calculate_bedset_statistics(bbc, bed_set)
result_dict = {
"name": bedset_name,
"md5sum": bed_set.bedset_identifier,
"description": description,
"genome": genome,
"bedset_standard_deviation": bed_set_stats["sd"],
"bedset_means": bed_set_stats["mean"],
"processed": heavy,
}

if heavy:
plot_value = create_plots(bbc, bedset=bed_set, bedset_name=record_id)
result_dict["region_commonality"] = plot_value
else:
_LOGGER.warning("Heavy processing is False. Plots won't be calculated")

bbc.bedset.report(
record_identifier=record_id,
values=result_dict,
force_overwrite=True,
)
for sample in bed_set:
bbc.report_relationship(record_id, sample.identifier)


def run_bedbuncher(
bedbase_config: str,
bedset_pep: str,
bedset_name: str = None,
bedbase_api: str = DEFAULT_BEDBASE_API_URL,
cache_path: str = DEFAULT_BEDBASE_CACHE_PATH,
heavy: bool = False,
*args,
**kwargs,
) -> None:
"""
Create a bedset from a PEP that lists its bed files, and add it to the BEDbase database
:param bedbase_config: bedbase configuration file path
:param bedset_name: name of the bedset, can be provided here or as pep name
:param bedset_pep: bedset pep path or pephub registry path containing bedset pep
:param bedbase_api: bedbase api url [DEFAULT: http://localhost:8000/api]
:param cache_path: path to the cache folder [DEFAULT: ./bedbase_cache]
:param heavy: whether to use heavy processing (add all columns to the database).
if False -> R-script won't be executed, only basic statistics will be calculated
:return: None
"""

bbc = BedBaseConf(bedbase_config)
if is_registry_path(bedset_pep):
pep_of_bed = pephubclient.PEPHubClient().load_project(bedset_pep)
bedset_record_id = bedset_pep
else:
pep_of_bed = peppy.Project(bedset_pep)
bedset_record_id = os.path.basename(bedset_pep)

bedset = create_bedset_from_pep(
pep=pep_of_bed, bedbase_api=bedbase_api, cache_folder=cache_path
)

if not pep_of_bed.config.get("genome"):
_LOGGER.warning(
f"Genome for bedset {bedset_name or pep_of_bed.get('name')} was not provided."
)
if not pep_of_bed.get("description"):
_LOGGER.warning(
f"Description for bedset {bedset_name or pep_of_bed.get('name')} was not provided."
)

add_bedset_to_database(
bbc,
record_id=bedset_record_id,
bed_set=bedset,
bedset_name=bedset_name or pep_of_bed.get("name"),
genome=dict(pep_of_bed.config.get("genome", {})),
description=pep_of_bed.description or "",
heavy=heavy,
)
_LOGGER.info(
f"bedset {bedset_name or pep_of_bed.get('name')} was added successfully to the database"
)
return None
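For orientation, a minimal usage sketch of the new entry point follows. The config path, PEP registry path, and bedset name are hypothetical placeholders rather than values from this commit, and the API URL simply restates the documented default.

from bedboss.bedbuncher import run_bedbuncher

# Hypothetical inputs: a local bedbase config and a PEPhub registry path whose
# sample_name fields are BEDbase record identifiers of already-processed bed files.
run_bedbuncher(
    bedbase_config="bedbase_config.yaml",
    bedset_pep="databio/example_bedset:default",
    bedset_name="example_bedset",
    bedbase_api="http://localhost:8000/api",
    heavy=False,  # skip the R-based region commonality plot; only basic statistics are reported
)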