diff --git a/MANIFEST.in b/MANIFEST.in index 3de398b..1c82bfe 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,4 +4,5 @@ include bedboss/* include bedboss/bedstat/* include bedboss/bedstat/tools/* include bedboss/bedmaker/* -include bedboss/bedqc/* \ No newline at end of file +include bedboss/bedqc/* +include bedboss/qdrant_index/* \ No newline at end of file diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index c00776d..ed0fb4e 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -11,6 +11,7 @@ from bedboss.bedstat.bedstat import bedstat from bedboss.bedmaker.bedmaker import BedMaker from bedboss.bedqc.bedqc import bedqc +from bedboss.qdrant_index import add_to_qdrant from bedboss.cli import build_argparser from bedboss.const import ( OS_HG19, @@ -234,14 +235,16 @@ def main(test_args: dict = None) -> NoReturn: ) if args_dict["command"] == "all": run_all(pm=pm, **args_dict) + elif args_dict["command"] == "all-pep": + run_all_by_pep(args_dict["pep_config"]) elif args_dict["command"] == "make": BedMaker(pm=pm, **args_dict) elif args_dict["command"] == "qc": bedqc(pm=pm, **args_dict) elif args_dict["command"] == "stat": bedstat(pm=pm, **args_dict) - elif args_dict["command"] == "all-pep": - run_all_by_pep(args_dict["pep_config"]) + elif args_dict["command"] == "index": + add_to_qdrant(pm=pm, **args_dict) else: parser.print_help() # raise Exception("Incorrect pipeline name.") diff --git a/bedboss/bedstat/bedstat.py b/bedboss/bedstat/bedstat.py index d2673a8..fd07925 100755 --- a/bedboss/bedstat/bedstat.py +++ b/bedboss/bedstat/bedstat.py @@ -254,7 +254,7 @@ def bedstat( if not skip_qdrant: bbc.add_bed_to_qdrant( bed_id=bed_digest, - bed_file_path=bedfile, + bed_file=bedfile, payload={"fileid": fileid}, ) bbc.bed.report( diff --git a/bedboss/cli.py b/bedboss/cli.py index af29106..a41f3e3 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -3,11 +3,13 @@ import logmuse from bedboss._version import __version__ +from bedboss.const import DEFAULT_BEDBASE_API_URL 
def build_argparser() -> ArgumentParser: """ BEDboss parser + :retrun: Tuple[pipeline, arguments] """ parser = VersionInHelpParser( @@ -38,6 +40,11 @@ def build_argparser() -> ArgumentParser: help="A pipeline to read a file in BED format and produce metadata " "in JSON format.", ) + + sub_index = subparser.add_parser( + "index", help="Index not indexed bed files and add them to the qdrant database " + ) + sub_all.add_argument( "--outfolder", required=True, @@ -318,4 +325,21 @@ def build_argparser() -> ArgumentParser: help="whether just to commit the JSON to the database", ) + sub_index.add_argument( + "--bedbase-config", + dest="bedbase_config", + type=str, + required=True, + help="a path to the bedbase configuration file [Required]", + ) + + sub_index.add_argument( + "--bedbase-api", + dest="bedbase_api", + type=str, + required=False, + default=DEFAULT_BEDBASE_API_URL, + help=f"URL of the Bedbase API [Default: {DEFAULT_BEDBASE_API_URL}]", + ) + return logmuse.add_logging_options(parser) diff --git a/bedboss/const.py b/bedboss/const.py index 8dc6285..a68a1d0 100644 --- a/bedboss/const.py +++ b/bedboss/const.py @@ -1,3 +1,5 @@ +DEFAULT_BEDBASE_API_URL = "https://bedbase.org/api" + OPEN_SIGNAL_FOLDER = "./openSignalMatrix" OPEN_SIGNAL_URL = "http://big.databio.org/open_chromatin_matrix/" diff --git a/bedboss/qdrant_index/__init__.py b/bedboss/qdrant_index/__init__.py new file mode 100644 index 0000000..5825fc2 --- /dev/null +++ b/bedboss/qdrant_index/__init__.py @@ -0,0 +1,3 @@ +from bedboss.qdrant_index.qdrant_index import add_to_qdrant + +__all__ = ["add_to_qdrant"] diff --git a/bedboss/qdrant_index/qdrant_index.py b/bedboss/qdrant_index/qdrant_index.py new file mode 100644 index 0000000..58c6e38 --- /dev/null +++ b/bedboss/qdrant_index/qdrant_index.py @@ -0,0 +1,64 @@ +import logging +from typing import List +from bbconf import BedBaseConf +from geniml.bbclient import BBClient +from geniml.region2vec import Region2VecExModel + +from bedboss.const import 
DEFAULT_BEDBASE_API_URL
+
+_LOGGER = logging.getLogger("bedboss")
+
+
+def get_unindexed_bed_files(bbc: BedBaseConf) -> List[str]:
+    """
+    Get record identifiers of hg38 bed files not yet added to qdrant
+
+    :param bbc: bedbase configuration object whose bed backend is queried
+    :return: list of record_identifiers of unindexed bed files
+    """
+    result_list = bbc.bed.backend.select_txt(
+        columns=["record_identifier"],
+        filter_templ="""added_to_qdrant = false and (genome->>'alias') = 'hg38'""",
+    )
+    return [result[0] for result in result_list]
+
+
+def add_to_qdrant(
+    bedbase_config: str,
+    bedbase_api: str = DEFAULT_BEDBASE_API_URL,
+    **kwargs,
+) -> None:
+    """
+    Add unindexed bed files to qdrant and mark them as indexed in bedbase
+
+    :param bedbase_config: path to the bedbase configuration file
+    :param bedbase_api: URL of the Bedbase API
+    :param kwargs: unused; accepted so callers can pass extra arguments
+    :return: None
+    """
+    bbc = BedBaseConf(config_path=bedbase_config)
+    list_of_record_ids = get_unindexed_bed_files(bbc)
+
+    if not list_of_record_ids:
+        _LOGGER.info("No unindexed bed files found")
+        return None
+
+    # Build the model and the client once; neither depends on the record.
+    region_to_vec_obj = Region2VecExModel("databio/r2v-ChIP-atlas-hg38")
+    bbclient = BBClient(cache_folder="~/bedbase_cache", bedbase_api=bedbase_api)
+
+    for record_id in list_of_record_ids:
+        bedfile_object = bbclient.load_bed(record_id)
+        bbc.add_bed_to_qdrant(
+            bed_id=record_id,
+            bed_file=bedfile_object,
+            # NOTE(review): placeholder payload kept unchanged -- confirm intent
+            payload={"description": "test"},
+            region_to_vec=region_to_vec_obj,
+        )
+        # Flag the record so the next run does not select it again.
+        bbc.bed.report(
+            record_identifier=record_id,
+            values={"added_to_qdrant": True},
+            force_overwrite=True,
+        )