Commit

Fixed #20

khoroshevskyi committed Nov 1, 2023
1 parent a00c5b7 commit b086146
Showing 19 changed files with 536 additions and 28 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -8,6 +8,7 @@ __pycache__/

# Distribution / packaging
.Python
.ruff_cache/
build/
develop-eggs/
dist/
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,10 @@
repos:
# Run the Ruff linter.
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.1.3
hooks:
# Run the Ruff linter.
- id: ruff
# Run the Ruff formatter.
- id: ruff-format
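For contributors, these hooks follow standard pre-commit usage: running "pre-commit install" once registers them as a git hook, and "pre-commit run --all-files" applies the Ruff linter and formatter to the whole tree.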
4 changes: 3 additions & 1 deletion MANIFEST.in
@@ -5,4 +5,6 @@ include bedboss/bedstat/*
include bedboss/bedstat/tools/*
include bedboss/bedmaker/*
include bedboss/bedqc/*
include bedboss/qdrant_index/*
include bedboss/qdrant_index/*
include bedboss/bedbuncher/*
include bedboss/bedbuncher/tools/*
2 changes: 1 addition & 1 deletion README.md
@@ -30,4 +30,4 @@ Calculates statistics about BED files.

Detailed information about each pipeline can be found in the [bedboss Readme](./docs/README.md).

For the specific bedbase.org instance, see instructions in the bedbase.org repo.
For the specific bedbase.org instance, see instructions in the bedbase.org repo.
12 changes: 10 additions & 2 deletions bedboss/__init__.py
@@ -2,13 +2,20 @@
import logmuse
import coloredlogs

from bedboss import *
# from bedboss import *

# from bedboss.bedqc.bedqc import bedqc
# from bedboss.bedmaker.bedmaker import BedMaker
# from bedboss.bedstat.bedstat import bedstat
from bedboss._version import __version__
from bedboss.bedboss import run_all, run_all_by_pep, bedqc, BedMaker, bedstat
from bedboss.bedboss import (
run_all,
run_all_by_pep,
bedqc,
BedMaker,
bedstat,
run_bedbuncher,
)


__package_name__ = "bedboss"
@@ -33,6 +40,7 @@
"bedstat",
"run_all",
"run_all_by_pep",
"run_bedbuncher",
]

_LOGGER = logmuse.init_logger("bedboss")
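With the updated exports above, the bundler entry point is importable at the package level; a minimal import sketch:

from bedboss import run_bedbuncher
# equivalent to importing it from the new subpackage directly:
# from bedboss.bedbuncher import run_bedbuncher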
2 changes: 1 addition & 1 deletion bedboss/_version.py
@@ -1 +1 @@
__version__ = "0.1.0a4"
__version__ = "0.1.0a5"
8 changes: 5 additions & 3 deletions bedboss/bedboss.py
@@ -1,8 +1,7 @@
import logging
import os
from typing import NoReturn, Union, Dict
from typing import NoReturn, Union

import peppy
import pypiper
from argparse import Namespace
import logmuse
@@ -11,6 +10,7 @@
from bedboss.bedstat.bedstat import bedstat
from bedboss.bedmaker.bedmaker import BedMaker
from bedboss.bedqc.bedqc import bedqc
from bedboss.bedbuncher import run_bedbuncher
from bedboss.qdrant_index import add_to_qdrant
from bedboss.cli import build_argparser
from bedboss.const import (
@@ -42,7 +42,7 @@ def get_osm_path(genome: str) -> Union[str, None]:
:return: path to the Open Signal Matrix
"""
# TODO: add more osm
_LOGGER.info(f"Getting Open Signal Matrix file path...")
_LOGGER.info("Getting Open Signal Matrix file path...")
if genome == "hg19" or genome == "GRCh37":
osm_name = OS_HG19
elif genome == "hg38" or genome == "GRCh38":
@@ -243,6 +243,8 @@ def main(test_args: dict = None) -> NoReturn:
bedqc(pm=pm, **args_dict)
elif args_dict["command"] == "stat":
bedstat(pm=pm, **args_dict)
elif args_dict["command"] == "bunch":
run_bedbuncher(pm=pm, **args_dict)
elif args_dict["command"] == "index":
add_to_qdrant(pm=pm, **args_dict)
else:
3 changes: 3 additions & 0 deletions bedboss/bedbuncher/__init__.py
@@ -0,0 +1,3 @@
from bedboss.bedbuncher.bedbuncher import run_bedbuncher

__all__ = ["run_bedbuncher"]
267 changes: 267 additions & 0 deletions bedboss/bedbuncher/bedbuncher.py
@@ -0,0 +1,267 @@
from geniml.io import BedSet
from bbconf import BedBaseConf
from bbconf.const import CFG_PATH_KEY, CFG_PATH_BEDBUNCHER_DIR_KEY
from geniml.bbclient import BBClient
from sqlmodel import select, func, Numeric, Float
import os
import json
import subprocess
import peppy
import pephubclient
from pephubclient.helpers import is_registry_path
import logging

from bedboss.const import DEFAULT_BEDBASE_API_URL, DEFAULT_BEDBASE_CACHE_PATH


_LOGGER = logging.getLogger("bedboss")


def create_bedset_from_pep(
pep: peppy.Project, bedbase_api: str, cache_folder: str = DEFAULT_BEDBASE_CACHE_PATH
) -> BedSet:
"""
Create a BedSet from a PEP, where each sample_name is a bed file identifier
:param pep: peppy Project whose samples name bed files by their BEDbase record identifiers
:param bedbase_api: URL of the BEDbase API used to fetch the bed files
:param cache_folder: path to the local cache folder used by BBClient
:return: BedSet containing the loaded bed files
"""
new_bedset = BedSet()
for bedfile_id in pep.samples:
bedfile_object = BBClient(
cache_folder=cache_folder,
bedbase_api=bedbase_api,
).load_bed(bedfile_id.sample_name)
new_bedset.add(bedfile_object)
return new_bedset


def calculate_bedset_statistics(bbc: BedBaseConf, bedset: BedSet) -> dict:
"""
Calculate mean and standard deviation for each numeric column of bedfiles in bedset
:param bbc: BedBase configuration object
:param bedset: Bedset object
:return: dict with mean and standard deviation for each numeric column:
{"sd": {"column_name": sd_value},
"mean": {"column_name": mean_value}}
"""

numeric_columns = [
column
for column, value in bbc.bed.result_schemas.items()
if value["type"] == "number"
]
list_of_samples = [sample.identifier for sample in bedset]

results_dict = {"mean": {}, "sd": {}}

for column_name in numeric_columns:
with bbc.bed.backend.session as s:
mean_bedset_statement = select(
func.round(
func.avg(getattr(bbc.BedfileORM, column_name)).cast(Numeric), 4
).cast(Float)
).where(bbc.BedfileORM.record_identifier.in_(list_of_samples))
sd_bedset_statement = select(
func.round(
func.stddev(getattr(bbc.BedfileORM, column_name)).cast(Numeric), 4
).cast(Float)
).where(bbc.BedfileORM.record_identifier.in_(list_of_samples))

results_dict["mean"][column_name] = s.exec(mean_bedset_statement).one()
results_dict["sd"][column_name] = s.exec(sd_bedset_statement).one()

return results_dict

# # Another way to do it, but it's slower:
# results_dict = {}
# results = bbc.bed.retrieve(record_identifier=list_of_samples, result_identifier=int_col)["records"]
# for sample in results:
# for stat_value_dict in sample.values():
# for key, value in stat_value_dict.items():
# if key in results_dict:
# results_dict[key].append(value)
# else:
# results_dict[key] = [value]


def create_bed_list_file(bedset: BedSet, file_path: str) -> None:
"""
Create a file listing the paths of the bed files in the bedset (this file is later used by the R script)
:param bedset: bed_set object
:param file_path: path to the file
:return: None
"""
list_of_samples = [sample.path for sample in bedset]

with open(file_path, "w") as f:
for sample in list_of_samples:
f.write(sample + "\n")

return None


def create_plots(
bbc: BedBaseConf,
bedset: BedSet,
bedset_name: str,
) -> dict:
"""
Create plots for a bedset (commonality region plot)
:param bbc: BedBaseConf object
:param bedset: Bedset object
:param bedset_name: bedset name
:return: dict with information about the created plot
"""
bedset_md5sum = bedset.bedset_identifier

output_folder = os.path.abspath(
bbc.config[CFG_PATH_KEY][CFG_PATH_BEDBUNCHER_DIR_KEY]
)
# if output folder doesn't exist create it
if not os.path.exists(output_folder):
os.makedirs(output_folder)
bedset_list_path = os.path.join(output_folder, f"{bedset_md5sum}_bedset.txt")
create_bed_list_file(bedset, bedset_list_path)
rscript_path = os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
"bedbuncher",
"tools",
"bedsetStat.R",
)
assert os.path.exists(rscript_path), FileNotFoundError(
f"'{rscript_path}' script not found"
)

json_file_path = os.path.join(output_folder, bedset_md5sum + ".json")
command = (
f"Rscript {rscript_path} --outputfolder={output_folder} "
f"--bedfilelist={bedset_list_path} --id={bedset_md5sum} "
f"--json={json_file_path}"
)

subprocess.run(command, shell=True)

with open(json_file_path, "r", encoding="utf-8") as f:
bedset_summary_info = json.loads(f.read())

os.remove(bedset_list_path)
os.remove(json_file_path)
return bedset_summary_info["plots"][0]


def add_bedset_to_database(
bbc: BedBaseConf,
record_id: str,
bed_set: BedSet,
bedset_name: str,
genome: dict = None,
description: str = None,
heavy: bool = False,
) -> None:
"""
Add bedset to the database
:param bbc: BedBaseConf object
:param record_id: record identifier to be used in database
:param bed_set: Bedset object
:param bedset_name: Bedset name
:param genome: genome of the bedset
:param description: Bedset description
:param heavy: whether to use heavy processing (add all columns to the database).
if False -> R-script won't be executed, only basic statistics will be calculated
:return:
"""
if not bedset_name:
raise ValueError(
"bedset_name was not provided correctly. Please provide it in pep name or as argument"
)

bed_set_stats = calculate_bedset_statistics(bbc, bed_set)
result_dict = {
"name": bedset_name,
"md5sum": bed_set.bedset_identifier,
"description": description,
"genome": genome,
"bedset_standard_deviation": bed_set_stats["sd"],
"bedset_means": bed_set_stats["mean"],
"processed": heavy,
}

if heavy:
plot_value = create_plots(bbc, bedset=bed_set, bedset_name=record_id)
result_dict["region_commonality"] = plot_value
else:
_LOGGER.warning("Heavy processing is False. Plots won't be calculated")

bbc.bedset.report(
record_identifier=record_id,
values=result_dict,
force_overwrite=True,
)
for sample in bed_set:
bbc.report_relationship(record_id, sample.identifier)


def run_bedbuncher(
bedbase_config: str,
bedset_pep: str,
bedset_name: str = None,
bedbase_api: str = DEFAULT_BEDBASE_API_URL,
cache_path: str = DEFAULT_BEDBASE_CACHE_PATH,
heavy: bool = False,
*args,
**kwargs,
) -> None:
"""
Create a bedset from a PEP that lists its bed files, and add it to the BEDbase database
:param bedbase_config: bedbase configuration file path
:param bedset_name: name of the bedset, can be provided here or as pep name
:param bedset_pep: bedset pep path or pephub registry path containing bedset pep
:param bedbase_api: bedbase api url [DEFAULT: http://localhost:8000/api]
:param cache_path: path to the cache folder [DEFAULT: ./bedbase_cache]
:param heavy: whether to use heavy processing (add all columns to the database).
if False -> R-script won't be executed, only basic statistics will be calculated
:return: None
"""

bbc = BedBaseConf(bedbase_config)
if is_registry_path(bedset_pep):
pep_of_bed = pephubclient.PEPHubClient().load_project(bedset_pep)
bedset_record_id = bedset_pep
else:
pep_of_bed = peppy.Project(bedset_pep)
bedset_record_id = os.path.basename(bedset_pep)

bedset = create_bedset_from_pep(
pep=pep_of_bed, bedbase_api=bedbase_api, cache_folder=cache_path
)

if not pep_of_bed.config.get("genome"):
_LOGGER.warning(
f"Genome for bedset {bedset_name or pep_of_bed.get('name')} was not provided."
)
if not pep_of_bed.get("description"):
_LOGGER.warning(
f"Description for bedset {bedset_name or pep_of_bed.get('name')} was not provided."
)

add_bedset_to_database(
bbc,
record_id=bedset_record_id,
bed_set=bedset,
bedset_name=bedset_name or pep_of_bed.get("name"),
genome=dict(pep_of_bed.config.get("genome", {})),
description=pep_of_bed.description or "",
heavy=heavy,
)
_LOGGER.info(
f"bedset {bedset_name or pep_of_bed.get('name')} was added successfully to the database"
)
return None
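For orientation, a minimal usage sketch of the new entry point follows. The config path, PEP registry path, and bedset name are hypothetical placeholders rather than values from this commit, and the API URL simply restates the documented default.

from bedboss.bedbuncher import run_bedbuncher

# Hypothetical inputs: a local bedbase config and a PEPhub registry path whose
# sample_name fields are BEDbase record identifiers of already-processed bed files.
run_bedbuncher(
    bedbase_config="bedbase_config.yaml",
    bedset_pep="databio/example_bedset:default",
    bedset_name="example_bedset",
    bedbase_api="http://localhost:8000/api",
    heavy=False,  # skip the R-based region commonality plot; only basic statistics are reported
)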