From 75c113db57a1c09443d186610e2da14f15b4d8c2 Mon Sep 17 00:00:00 2001 From: uddamvathanak Date: Thu, 24 Oct 2024 17:24:23 +0800 Subject: [PATCH] Updating the code base for NAR publication --- .gitignore | 0 .readthedocs.yaml | 0 README.md | 2 +- __pycache__/main.cpython-38.pyc | Bin __pycache__/setup.cpython-38.pyc | Bin __pycache__/test_file.cpython-38.pyc | Bin build/lib/discotoolkit/CELLiD.py | 335 +++++++++++++----- build/lib/discotoolkit/DiscoClass.py | 14 +- build/lib/discotoolkit/DownloadDiscoData.py | 314 ++++++++++------ build/lib/discotoolkit/GetMetadata.py | 233 ++++++++---- build/lib/discotoolkit/GlobalVariable.py | 18 +- build/lib/discotoolkit/Utilities.py | 74 ++++ build/lib/discotoolkit/__init__.py | 2 +- discotoolkit/CELLiD.py | 335 +++++++++++++----- discotoolkit/DiscoClass.py | 14 +- discotoolkit/DownloadDiscoData.py | 314 ++++++++++------ discotoolkit/GetMetadata.py | 233 ++++++++---- discotoolkit/GlobalVariable.py | 18 +- discotoolkit/Utilities.py | 74 ++++ discotoolkit/__init__.py | 2 +- .../__pycache__/CELLiD.cpython-311.pyc | Bin 0 -> 24971 bytes .../__pycache__/DiscoClass.cpython-311.pyc | Bin 0 -> 3209 bytes .../DownloadDiscoData.cpython-311.pyc | Bin 0 -> 3492 bytes .../__pycache__/GeneSearch.cpython-311.pyc | Bin 0 -> 6477 bytes .../__pycache__/GetMetadata.cpython-311.pyc | Bin 0 -> 13674 bytes .../GlobalVariable.cpython-311.pyc | Bin 0 -> 620 bytes .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 375 bytes docs/.DS_Store | Bin 0 -> 6148 bytes docs/CELLiD_celltype_annotation.ipynb | 0 docs/Gene_search.ipynb | 0 docs/download_data.ipynb | 0 docs/index.md | 2 +- docs/scEnrichment.ipynb | 0 setup.py | 2 +- 34 files changed, 1405 insertions(+), 581 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 .readthedocs.yaml mode change 100644 => 100755 __pycache__/main.cpython-38.pyc mode change 100644 => 100755 __pycache__/setup.cpython-38.pyc mode change 100644 => 100755 __pycache__/test_file.cpython-38.pyc create mode 100644 build/lib/discotoolkit/Utilities.py create mode 100644 discotoolkit/Utilities.py create mode 100644 discotoolkit/__pycache__/CELLiD.cpython-311.pyc create mode 100644 discotoolkit/__pycache__/DiscoClass.cpython-311.pyc create mode 100644 discotoolkit/__pycache__/DownloadDiscoData.cpython-311.pyc create mode 100644 discotoolkit/__pycache__/GeneSearch.cpython-311.pyc create mode 100644 discotoolkit/__pycache__/GetMetadata.cpython-311.pyc create mode 100644 discotoolkit/__pycache__/GlobalVariable.cpython-311.pyc create mode 100644 discotoolkit/__pycache__/__init__.cpython-311.pyc create mode 100644 docs/.DS_Store mode change 100644 => 100755 docs/CELLiD_celltype_annotation.ipynb mode change 100644 => 100755 docs/Gene_search.ipynb mode change 100644 => 100755 docs/download_data.ipynb mode change 100644 => 100755 docs/scEnrichment.ipynb diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/.readthedocs.yaml b/.readthedocs.yaml old mode 100644 new mode 100755 diff --git a/README.md b/README.md index 7d5fbf2..94adc90 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ --> [![Documentation Status](https://readthedocs.org/projects/discotoolkit-py/badge/?version=latest)](https://discotoolkit-py.readthedocs.io/en/latest/?badge=latest) [![Downloads](https://static.pepy.tech/personalized-badge/discotoolkit?period=total&units=international_system&left_color=black&right_color=orange&left_text=Downloads)](https://pepy.tech/project/discotoolkit) [![PyPI 
version](https://img.shields.io/pypi/v/discotoolkit)](https://pypi.org/project/discotoolkit) -# DISCOtoolkit 1.1.3 +# DISCOtoolkit 1.1.4 DISCOtoolkit is an python package that allows users to access data and use the tools provided by the [DISCO database](https://www.immunesinglecell.org/). Read the documentation [DISCOtoolkit](https://discotoolkit-py.readthedocs.io/en/latest/). It provides the following functions: diff --git a/__pycache__/main.cpython-38.pyc b/__pycache__/main.cpython-38.pyc old mode 100644 new mode 100755 diff --git a/__pycache__/setup.cpython-38.pyc b/__pycache__/setup.cpython-38.pyc old mode 100644 new mode 100755 diff --git a/__pycache__/test_file.cpython-38.pyc b/__pycache__/test_file.cpython-38.pyc old mode 100644 new mode 100755 diff --git a/build/lib/discotoolkit/CELLiD.py b/build/lib/discotoolkit/CELLiD.py index 922c9de..d8b10b4 100644 --- a/build/lib/discotoolkit/CELLiD.py +++ b/build/lib/discotoolkit/CELLiD.py @@ -9,10 +9,11 @@ import hashlib import requests from scipy import stats + # from fast_fisher import fast_fisher_exact, odds_ratio from scipy.stats.contingency import odds_ratio -from joblib import Parallel, delayed # multiprocessing library -from pandarallel import pandarallel # multiprocessing library +from joblib import Parallel, delayed # multiprocessing library +from pandarallel import pandarallel # multiprocessing library pd.options.mode.chained_assignment = None @@ -24,17 +25,45 @@ """ CELLiD cell type annotation function block. """ -# get the total atlas from DISCO website for the user to select which atlas to use -def get_atlas(ref_data = None, ref_path = None): + +def generate_correlation_map(x, y): + """Correlate each n with each m. + + Parameters + ---------- + x : np.array + Shape N X T. + + y : np.array + Shape M X T. + + Returns + ------- + np.array + N X M array in which each element is a correlation coefficient. 
+ + """ + mu_x = x.mean(1) + mu_y = y.mean(1) + n = x.shape[1] + if n != y.shape[1]: + raise ValueError("x and y must " + "have the same number of timepoints.") + s_x = x.std(1, ddof=n - 1) + s_y = y.std(1, ddof=n - 1) + cov = np.dot(x, y.T) - n * np.dot(mu_x[:, np.newaxis], mu_y[np.newaxis, :]) + return cov / np.dot(s_x[:, np.newaxis], s_y[np.newaxis, :]) + + +# get the total atlas from DISCO website for the user to select which atlas to use +def get_atlas(ref_data=None, ref_path=None): """get the all the atlas string from the DISCO website and return to the user Returns: List: return list of string - """ + """ # Download reference data and ref_deg if missing if ref_data is None: - # check for the data path if ref_path is None: ref_path = "DISCOtmp" @@ -42,19 +71,33 @@ def get_atlas(ref_data = None, ref_path = None): # if the path does not exist, we create a default directory if not os.path.exists(ref_path): os.mkdir(ref_path) - + # download reference data from the server in pickle extension and read as pandas dataframe if ref_data is None: - if not(os.path.exists(ref_path + "/ref_data.pkl")): + if not (os.path.exists(ref_path + "/ref_data.pkl")): logging.info("Downloading reference dataset...") - response = requests.get(url=prefix_disco_url +"/getRefPkl", timeout=timeout) + response = requests.get( + url=prefix_disco_url + "toolkit/getRef?type=pkl", timeout=timeout + ) open(ref_path + "/ref_data.pkl", "wb").write(response.content) - ref_data = pd.read_pickle(ref_path + "/ref_data.pkl", compression = {'method':'gzip','compresslevel':6}) + ref_data = pd.read_pickle( + ref_path + "/ref_data.pkl", + compression={"method": "gzip", "compresslevel": 6}, + ) all_atlas = [each.split("--")[1] for each in ref_data.columns] return list(set(all_atlas)) -def CELLiD_cluster(rna, ref_data : pd.DataFrame = None, ref_deg : pd.DataFrame = None, atlas : str = None, n_predict : int = 1, ref_path : str = None, ncores : int = 10): + +def CELLiD_cluster( + rna, + ref_data: pd.DataFrame = None, + ref_deg: pd.DataFrame = None, + atlas: str = None, + n_predict: int = 1, + ref_path: str = None, + ncores: int = 10, +): """Cell type annotation using reference data and compute the correlation between the user cell gene expression as compare to the reference data. The celltype with highest correlation will be concluded as the celltype @@ -73,20 +116,21 @@ def CELLiD_cluster(rna, ref_data : pd.DataFrame = None, ref_deg : pd.DataFrame = # initialize the pandarallel for parallel processing pandarallel.initialize(nb_workers=ncores) - + # check for the input data - if not(isinstance(rna, pd.DataFrame) or isinstance(rna, np.ndarray)): + if not (isinstance(rna, pd.DataFrame) or isinstance(rna, np.ndarray)): logging.error("the rna must be a pandas DataFrame or Numpy Array") # check for number of prediction of celltypes and set the largest number to 3 only if n_predict is not None: if n_predict > 3: - logging.info("Any value of n_predict that exceeds 3 will be automatically adjusted to 3.") + logging.info( + "Any value of n_predict that exceeds 3 will be automatically adjusted to 3." 
+ ) n_predict = 3 - + # Download reference data and ref_deg if missing if (ref_data is None) or (ref_deg is None): - # check for the data path if ref_path is None: ref_path = "DISCOtmp" @@ -94,42 +138,60 @@ def CELLiD_cluster(rna, ref_data : pd.DataFrame = None, ref_deg : pd.DataFrame = # if the path does not exist, we create a default directory if not os.path.exists(ref_path): os.mkdir(ref_path) - + # download reference data from the server in pickle extension and read as pandas dataframe if ref_data is None: - if not(os.path.exists(ref_path + "/ref_data.pkl")): + if not (os.path.exists(ref_path + "/ref_data.pkl")): logging.info("Downloading reference dataset...") - response = requests.get(url=prefix_disco_url +"/getRefPkl", timeout=timeout) + response = requests.get( + url=prefix_disco_url + "toolkit/getRef?type=pkl", timeout=timeout + ) open(ref_path + "/ref_data.pkl", "wb").write(response.content) - ref_data = pd.read_pickle(ref_path + "/ref_data.pkl", compression = {'method':'gzip','compresslevel':6}) + ref_data = pd.read_pickle( + ref_path + "/ref_data.pkl", + compression={"method": "gzip", "compresslevel": 6}, + ) # similarly do the same for the DEG reference if ref_deg is None: - if not(os.path.exists(ref_path + "/ref_deg.pkl")): + if not (os.path.exists(ref_path + "/ref_deg.pkl")): logging.info("Downloading deg dataset...") - response = requests.get(url=prefix_disco_url +"/getRefDegPkl", timeout=timeout) + response = requests.get( + url=prefix_disco_url + "toolkit/getRefDeg?type=pkl", timeout=timeout + ) open(ref_path + "/ref_deg.pkl", "wb").write(response.content) - ref_deg = pd.read_pickle(ref_path + "/ref_deg.pkl", compression = {'method':'gzip','compresslevel':6}) - + ref_deg = pd.read_pickle( + ref_path + "/ref_deg.pkl", + compression={"method": "gzip", "compresslevel": 6}, + ) + #### # write list of atlas function # we can write a function to get all the atlas from disco to the users # subsetting to the selected atlas by the user if atlas is not None: - select_ref = [index for index, each in enumerate(ref_data.columns) if each.split("--")[1] in list(atlas)] + select_ref = [ + index + for index, each in enumerate(ref_data.columns) + if each.split("--")[1] in list(atlas) + ] ref_data = ref_data.iloc[:, select_ref] - + # get the intersection of the gene for the user data and the reference data genes = set.intersection(set(rna.index), set(ref_data.index)) # give an error as 2000 genes are very low if len(genes) <= 2000: - logging.error("Less than 2000 genes are shared between the input data and the reference dataset!") + logging.error( + "Less than 2000 genes are shared between the input data and the reference dataset!" + ) # give a warning to the user if len(genes) <= 5000: - logging.warning("The input data and reference dataset have a limited number of overlapping genes, which may potentially impact the accuracy of the CELLiD.") - + logging.warning( + "The input data and reference dataset have a limited number of overlapping genes, which may potentially impact the accuracy of the CELLiD." 
+ ) + # subsetting the dataframe to the common genes across data and the reference rna = rna.loc[list(genes)] ref_data = ref_data.loc[list(genes)] @@ -137,7 +199,7 @@ def CELLiD_cluster(rna, ref_data : pd.DataFrame = None, ref_deg : pd.DataFrame = # nested apply function to compute correlation for each cell type between the user and reference database def each_ref_correlation(input, ref_data): input = list(input) - res = ref_data.parallel_apply(basic_correlation, input = input, axis = 0) + res = ref_data.parallel_apply(basic_correlation, input=input, axis=0) return res # apply the correlation to all the columns in user data @@ -148,9 +210,14 @@ def basic_correlation(ref, input): predicted = pd.Series(list(ref)).corr(pd.Series(list(input)), method="spearman") return predicted - + # get the predicted correlation as in pandas DataFrame - predicted_cell = rna.apply(each_ref_correlation, result_type = "expand", axis = 0, ref_data = ref_data) + # predicted_cell = pd.DataFrame( + # generate_correlation_map(ref_data.to_numpy().T, rna.to_numpy().T) + # ) + predicted_cell = rna.apply( + each_ref_correlation, result_type="expand", axis=0, ref_data=ref_data + ) predicted_cell = pd.DataFrame(predicted_cell) # set the index to be the celltype based on the reference_data @@ -161,60 +228,84 @@ def basic_correlation(ref, input): # define a function to apply to each column of the dataframe def get_top_5_indices(col): # get the indices of the top 5 values in the column - indices = np.where(col.rank(method='min', ascending=False) <= 5)[0] + indices = np.where(col.rank(method="min", ascending=False) <= 5)[0] # convert to numeric and return - return pd.Series(indices).astype('int') - - ct = predicted_cell.apply(get_top_5_indices) - - # ct = pd.DataFrame(predicted_cell.parallel_apply(lambda x: [index for index, each in enumerate(x.rank()) if each <= 5], axis = 0, result_type = "expand")) + return pd.Series(indices).astype("int") + ct = predicted_cell.apply(get_top_5_indices) # compute the correleration based on the genes set that is intersection between the user data and reference dataset + def second_correlation(i, ref_data, rna, ct): - ref = ref_data.iloc[:,ct[i]].copy() - g = set(ref_deg.iloc[check_in_list(ref_deg["group"], ref.columns)].copy()["gene"]) + ref = ref_data.iloc[:, list(ct[str(i)])].copy() + g = set( + ref_deg.iloc[check_in_list(ref_deg["group"], ref.columns)].copy()["gene"] + ) g = set.intersection(set(ref.index), g) ref = ref.loc[list(g)].copy() - input = rna.loc[list(g), i].copy() + # print(rna.loc[list(g), str(i)]) + input = rna.loc[list(g), str(i)].copy() # predict = ref.apply(lambda i: np.round(stats.spearmanr(pd.to_numeric(i), pd.to_numeric(input), nan_policy='omit')[0], 3)) - predict = ref.apply(lambda i: np.round(pd.Series(i).corr(pd.Series(input), method='spearman'), 3)) # trying with pandas correlation + predict = ref.apply( + lambda i: np.round( + pd.Series(i).corr(pd.Series(input), method="spearman"), 3 + ) + ) # trying with pandas correlation predict = pd.DataFrame(predict) predict.columns = ["cor"] - predict = predict.sort_values(["cor"], ascending = False) - res = [[each.split("--")[0], each.split("--")[1], predict["cor"][index], i] for index, each in enumerate(predict.index) if index < n_predict] - res = pd.DataFrame(res, columns =['cell_type', 'atlas', "score", "input_index"]) + predict = predict.sort_values(["cor"], ascending=False) + res = [ + [each.split("--")[0], each.split("--")[1], predict["cor"][index], i] + for index, each in enumerate(predict.index) + if index < 
n_predict + ] + res = pd.DataFrame(res, columns=["cell_type", "atlas", "score", "input_index"]) return res # return list in the format cell_type, atlas, score, input_index # get a data in the format of cell_type, atlas, score, input_index - predicted_cell = Parallel(n_jobs=ncores, batch_size=32, verbose = 2)(delayed(second_correlation)(int(y), ref_data, rna, ct) for y in range(rna.shape[1])) + predicted_cell = Parallel(n_jobs=ncores, batch_size=32, verbose=2)( + delayed(second_correlation)(int(y), ref_data, rna, ct) + for y in range(rna.shape[1]) + ) # first concatenate in row wise manner predicted_cell = pd.concat(predicted_cell) - pivot = pd.pivot_table(predicted_cell, index='input_index', columns=predicted_cell.groupby('input_index').cumcount()+1, - values=['cell_type', 'atlas', 'score'], aggfunc='first') + pivot = pd.pivot_table( + predicted_cell, + index="input_index", + columns=predicted_cell.groupby("input_index").cumcount() + 1, + values=["cell_type", "atlas", "score"], + aggfunc="first", + ) - pivot = pivot[["cell_type", "atlas", "score"]] + pivot = pivot[["cell_type", "atlas", "score"]] # flatten multi-level column index - pivot.columns = ['_'.join(map(str, col)).strip() for col in pivot.columns.values] + pivot.columns = ["_".join(map(str, col)).strip() for col in pivot.columns.values] # rename columns column_names = {} for i in range(n_predict): - column_names[f'cell_type_{i+1}'] = f'predicted_cell_type_{i+1}' - column_names[f'atlas_{i+1}'] = f'source_atlas_{i+1}' - column_names[f'score_{i+1}'] = f'score_{i+1}' - + column_names[f"cell_type_{i+1}"] = f"predicted_cell_type_{i+1}" + column_names[f"atlas_{i+1}"] = f"source_atlas_{i+1}" + column_names[f"score_{i+1}"] = f"score_{i+1}" + pivot = pivot.rename(columns=column_names) - return pivot# return pandas dataframe in the format the same original R package + return pivot # return pandas dataframe in the format the same original R package + """ Geneset enrichment analysis using DISCO data """ -def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref_path : str = None, ncores : int = 1): + +def CELLiD_enrichment( + input: pd.DataFrame, + reference: pd.DataFrame = None, + ref_path: str = None, + ncores: int = 1, +): """Function to generate enrichment analysis based on the reference gene sets and following the DISCO pipeline. 
Args: @@ -229,7 +320,7 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref # check input data # check if the input type is a DataFrame - if not(isinstance(input, pd.DataFrame)): + if not (isinstance(input, pd.DataFrame)): logging.error("The input must be a dataframe") # check if the user provide more than the needed column which are gene and fc @@ -240,7 +331,7 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref if input.shape[1] == 2: input_shape = 2 input.columns = ["gene", "fc"] - + # else we can just take a gene list for the enrichment else: input_shape = 1 @@ -248,7 +339,6 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref # if the user does not provide reference geneset list if reference is None: - # check for the data path if it is provided by the user if ref_path is None: ref_path = "DISCOtmp" @@ -256,31 +346,45 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref # if the path does not exist, we create a default directory if not os.path.exists(ref_path): os.mkdir(ref_path) - + # check if the file does not exist if not (os.path.exists(ref_path + "/ref_geneset.pkl")): # downloading the geneset data from disco database - response = requests.get(url=prefix_disco_url +"/getGeneSetPkl", timeout=timeout) + response = requests.get( + url=prefix_disco_url + "/getGeneSetPkl", timeout=timeout + ) open(ref_path + "/ref_geneset.pkl", "wb").write(response.content) - reference = pd.read_pickle(ref_path + "/ref_geneset.pkl", compression = {'method':'gzip','compresslevel':6}) + reference = pd.read_pickle( + ref_path + "/ref_geneset.pkl", + compression={"method": "gzip", "compresslevel": 6}, + ) else: # read the data into pandas dataframe for subsequent analysis - reference = pd.read_pickle(ref_path + "/ref_geneset.pkl", compression = {'method':'gzip','compresslevel':6}) + reference = pd.read_pickle( + ref_path + "/ref_geneset.pkl", + compression={"method": "gzip", "compresslevel": 6}, + ) # rename the name data to include the reference atlas reference["name"] = reference["name"] + " in " + reference["atlas"] - input["gene"] = input["gene"].str.upper() # convert all the gene to upper string as the reference gene name is in uppercase + input["gene"] = input[ + "gene" + ].str.upper() # convert all the gene to upper string as the reference gene name is in uppercase # condition when the user provide the fold change for the enrichment analysis if input_shape == 2: - input["fc"] = 2 ** input["fc"] # 2 to the power of fc - input = input.set_index("gene", drop = False) # set index to the gene - input = input.loc[list(set(reference["gene"]) & set(input['gene']))] # only get the common genes for comparison + input["fc"] = 2 ** input["fc"] # 2 to the power of fc + input = input.set_index("gene", drop=False) # set index to the gene + input = input.loc[ + list(set(reference["gene"]) & set(input["gene"])) + ] # only get the common genes for comparison # else we only look at the ranked gene list else: - input = input.set_index("gene", drop = False) - input = input.loc[list(set(reference["gene"]) & set(input['gene']))] # only get the common genes for comparison + input = input.set_index("gene", drop=False) + input = input.loc[ + list(set(reference["gene"]) & set(input["gene"])) + ] # only get the common genes for comparison # filter to get only the geneset that contain the input genes from the user # this might good different result from the website so lets remove it for # unique_names = 
reference[reference["gene"].isin(input["gene"])]["name"].unique() @@ -291,8 +395,8 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref # optimise the CELLiD_enrichment function atlas_dict = {} name_dict = {} - reference_atlas = reference['atlas'] - reference_name = reference['name'] + reference_atlas = reference["atlas"] + reference_name = reference["name"] # pre compute the parameters of the fisher exact test for the reference data for rownum in range(len(reference)): @@ -305,57 +409,105 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref name_dict[reference_name[rownum]] = [rownum] else: name_dict[reference_name[rownum]].append(rownum) - + # apply function to the dataframe - def process_unique_name(reference_filter, input, atlas_dfs, input_shape, atlas_dict): + def process_unique_name( + reference_filter, input, atlas_dfs, input_shape, atlas_dict + ): # use the compile`d pattern to search for matches - unique_name = reference_filter['name'].head(1).values[0] - atlas = reference_filter['atlas'].head(1).values + unique_name = reference_filter["name"].head(1).values[0] + atlas = reference_filter["atlas"].head(1).values # print(unique_name) reference_full = atlas_dfs[list(atlas_dict.keys()).index(atlas)] # reference_filter = reference.iloc[name_dict[unique_name]].copy() # subset the reference data # reference_full = reference.iloc[atlas_dict[atlas]].copy() # full reference to the atlas - reference_filter = reference_filter.set_index(["gene"], drop = False) # set gene as the index + reference_filter = reference_filter.set_index( + ["gene"], drop=False + ) # set gene as the index # reference_filter["gene"] = reference_filter.index - input_filter = input.loc[list(set(reference_full["gene"]) & set(input["gene"]))] # subset the input genes + input_filter = input.loc[ + list(set(reference_full["gene"]) & set(input["gene"])) + ] # subset the input genes # condition for no passed gene set if input_filter.empty: return None - + # different computation to either include the fold change in the fisher exact test if input_shape == 2: - a = (reference_filter.loc[np.intersect1d(reference_filter["gene"], (input_filter["gene"]))].iloc[:, 0] * input_filter.loc[np.intersect1d(input_filter["gene"], reference_filter["gene"]), "fc"]).sum() + 1 - b = input_filter.loc[np.setdiff1d(input_filter["gene"], reference_filter["gene"]), "fc"].sum() + 1 - c = reference_filter.loc[np.setdiff1d(reference_filter["gene"], input_filter["gene"])].iloc[:, 0].sum() + 1 - d = len(set(reference_full["gene"])) - len(set(reference_filter["gene"]).union(set(input_filter["gene"]))) + a = ( + reference_filter.loc[ + np.intersect1d(reference_filter["gene"], (input_filter["gene"])) + ].iloc[:, 0] + * input_filter.loc[ + np.intersect1d(input_filter["gene"], reference_filter["gene"]), "fc" + ] + ).sum() + 1 + b = ( + input_filter.loc[ + np.setdiff1d(input_filter["gene"], reference_filter["gene"]), "fc" + ].sum() + + 1 + ) + c = ( + reference_filter.loc[ + np.setdiff1d(reference_filter["gene"], input_filter["gene"]) + ] + .iloc[:, 0] + .sum() + + 1 + ) + d = len(set(reference_full["gene"])) - len( + set(reference_filter["gene"]).union(set(input_filter["gene"])) + ) else: a = len(np.intersect1d(reference_filter["gene"], input_filter["gene"])) + 1 b = len(np.setdiff1d(input_filter["gene"], reference_filter["gene"])) + 1 c = len(np.setdiff1d(reference_filter["gene"], input_filter["gene"])) + 1 d = len(set(reference_full["gene"])) - a - b - c - + # get both value from the fisher exact 
test # pval = pvalue(a, b, c, d).two_tail _, pval = stats.fisher_exact(np.array([[a, b], [c, d]])) - odds = (a * d)/(b * c) + odds = (a * d) / (b * c) # only get the significant p_value # we can make changes to the p_value as in like we allow the user to specify it so that they have more control on the result they can obtain if pval < 0.01: - return pd.Series({"pval": pval, "or": odds, "name": unique_name, - "gene": ",".join(reference_filter.loc[reference_filter["gene"].isin(input_filter["gene"]), "gene"]), - "background": len(set(reference_full["gene"])), "overlap": len(reference_filter.loc[reference_filter["gene"].isin(input_filter["gene"])]), "geneset": len(reference_filter)}) + return pd.Series( + { + "pval": pval, + "or": odds, + "name": unique_name, + "gene": ",".join( + reference_filter.loc[ + reference_filter["gene"].isin(input_filter["gene"]), "gene" + ] + ), + "background": len(set(reference_full["gene"])), + "overlap": len( + reference_filter.loc[ + reference_filter["gene"].isin(input_filter["gene"]) + ] + ), + "geneset": len(reference_filter), + } + ) else: return None # Split them into cores unique_name_dfs = [reference.iloc[name_dict[key], :] for key in name_dict.keys()] atlas_dfs = [reference.iloc[atlas_dict[key], :] for key in atlas_dict.keys()] - + # return unique_name_dfs, atlas_dfs, input, input_shape # return unique_name_dfs, atlas_dfs # using joblib to apply multiprocessing - results = Parallel(n_jobs=ncores, verbose = 2, batch_size = 64)(delayed(process_unique_name)(unique_name_df, input, atlas_dfs, input_shape, atlas_dict) for unique_name_df in unique_name_dfs) + results = Parallel(n_jobs=ncores, verbose=2, batch_size=64)( + delayed(process_unique_name)( + unique_name_df, input, atlas_dfs, input_shape, atlas_dict + ) + for unique_name_df in unique_name_dfs + ) results = [res for res in results if res is not None] # concatenate the result into the dataframe to the user @@ -363,8 +515,9 @@ def process_unique_name(reference_filter, input, atlas_dfs, input_shape, atlas_d res_df = pd.concat(results, axis=1).transpose() res_df["pval"] = res_df["pval"].astype(float).round(3) res_df["or"] = res_df["or"].astype(float).round(3) - res_df = res_df.sort_values(by=["pval", "or"], ascending=[True, False]).head(min(50, len(results))) + res_df = res_df.sort_values(by=["pval", "or"], ascending=[True, False]).head( + min(50, len(results)) + ) return res_df else: return None - \ No newline at end of file diff --git a/build/lib/discotoolkit/DiscoClass.py b/build/lib/discotoolkit/DiscoClass.py index a055cb2..ab3d4fe 100644 --- a/build/lib/discotoolkit/DiscoClass.py +++ b/build/lib/discotoolkit/DiscoClass.py @@ -6,10 +6,10 @@ class Filter: """ Filter class object to save the attributes for filtering the dataset from DISCO - sample String e.g. GSM3891625_3; - project String; - tissue String e.g. Lung, Bladder; - disease String e.g. PDAC; + sample_id String e.g. ERX2757110; + project_id String; + tissue String e.g. lung, bladder; + disease String e.g. COVID-19; platform String e.g. 
10x3'; sample_type String; cell_type String; @@ -20,12 +20,12 @@ class Filter: return Class object """ - def __init__(self, sample = None, project = None, tissue = None, disease = None, platform = None, sample_type = None, + def __init__(self, sample_id = None, project_id = None, tissue = None, disease = None, platform = None, sample_type = None, cell_type = None, cell_type_confidence : str = "medium", include_cell_type_children : bool = True, min_cell_per_sample : int = 100): # handling for string and list input - self.sample = self.convert_to_list(sample) # sample id - self.project = self.convert_to_list(project) # project, lab, or dataset from different author + self.sample_id = self.convert_to_list(sample_id) # sample id + self.project_id = self.convert_to_list(project_id) # project, lab, or dataset from different author self.tissue = self.convert_to_list(tissue) # organ tissue self.disease = self.convert_to_list(disease) # cancer or non cancer, or COVID-1e9 disease self.platform = self.convert_to_list(platform) # sequencing platform diff --git a/build/lib/discotoolkit/DownloadDiscoData.py b/build/lib/discotoolkit/DownloadDiscoData.py index 273986a..484c7d4 100644 --- a/build/lib/discotoolkit/DownloadDiscoData.py +++ b/build/lib/discotoolkit/DownloadDiscoData.py @@ -1,11 +1,11 @@ -''' -Descripttion: -version: +""" +Descripttion: +version: Author: Mengwei Li Date: 2023-04-14 16:33:44 LastEditors: Mengwei Li -LastEditTime: 2023-04-16 12:14:48 -''' +LastEditTime: 2024-10-23 21:30:03 +""" # import libraries import requests @@ -14,14 +14,15 @@ import re import os import hashlib +import scanpy as sc # import variable and class from other script from .GlobalVariable import logging, prefix_disco_url from .DiscoClass import FilterData, Filter from .GetMetadata import check_in_list -def download_disco_data(metadata, output_dir : str = "DISCOtmp"): +def download_disco_data(metadata, output_dir: str = "DISCOtmp"): """function to download the data based on the given filter Args: metadata (FilterData) : FilterData class to filters data from DISCO database @@ -29,116 +30,209 @@ def download_disco_data(metadata, output_dir : str = "DISCOtmp"): Returns: None: This function does not return any object and instead download the data for the user - """ + """ # define a list to store the error sample that can not be download error_sample = [] # create directory known as DISCOtmp or based on user defined directory if it does not exist - if not(os.path.exists(output_dir)): + if not (os.path.exists(output_dir)): os.mkdir(output_dir) + samples = list(metadata.sample_metadata["sample_id"]) # download data when the user does not want to subset to specific cell type - if metadata.filter.cell_type is None: - - # getting the dataframe of the samples - samples = metadata.sample_metadata - - # iterate through all the samples - for i in range(len(samples)): - s = list(samples["sampleId"])[i] - p = list(samples["projectId"])[i] - output_file = "%s/%s.h5ad" % (output_dir, s) # getting the name of the file - - # condition if the file has been downloaded before - if (os.path.exists(output_file)) and \ - hashlib.md5(open(output_file, "rb").read()).hexdigest() == list(samples["md5h5ad"])[i]: - logging.info(" %s has been downloaded before. Ignore ..." 
% (s)) # giving message to the user - else: - logging.info("Downloading data of %s" % (s)) # giving message to the user - - try: - # try to download the data as requested - response = requests.get(url=prefix_disco_url + "getH5adBySample?project="+ p +"&sample=" + s) - - # condition for error in downloading the samples - if response.status_code == 404: - error_sample.append(s) # append the error samples into a list for debugging later on - logging.warning("sample %s download fail" % (s)) # giving message to the user - else: - # write the file to directory - open(output_file, "wb").write(response.content) - - # condition for another error - if hashlib.md5(open(output_file, "rb").read()).hexdigest() != list(samples["md5h5ad"])[i]: - error_sample.append(s) - os.remove(output_file) - logging.warning("sample %s download fail" % (s)) # give message to the user - except: - # except error when the download fail - error_sample.append(s) - logging.warning("sample %s download fail" % (s)) - - # if there are error samples, we set all the attribute for FilterData object to the error samples and return the object in FilterData object - if len(error_sample) > 0: - metadata.sample_metadata = metadata.sample_metadata.set_index("sampleId") # change from inplace to this to increase efficiency - metadata.sample_metadata = metadata.sample_metadata.loc[error_sample] - metadata.cell_type_metadata = metadata.cell_type_metadata.iloc[check_in_list(metadata.cell_type_metadata["sampleId"], error_sample)] - metadata.cell_count = metadata.cell_type_metadata["cellNumber"].sum() - metadata.sample_count = len(error_sample) - return metadata - - else: - # get the dataframe for the metadata - metadata.sample_metadata = metadata.sample_metadata.set_index("sampleId") - samples = metadata.cell_type_metadata - concat_func = lambda x,y: x + "_" + str(y) - - # create new element by concatenating the sampleId and cluster - samples["sampleId"] = list(map(concat_func, list(samples["sampleId"]), - list(samples["cluster"]))) - - # similarly, iterative through all the sample for downloading - for i in range(len(samples)): - s = list(samples["sampleId"])[i] - sub_s = s.replace("_" + str(list(samples["cluster"])[i]) , "") # adding handler for celltype specific to get the data - p = metadata.sample_metadata.loc[sub_s]["projectId"] - output_file = "%s/%s.h5ad" % (output_dir, s) + + samples = metadata.sample_metadata + + for i in range(len(samples)): + s = list(samples["sample_id"])[i] + p = list(samples["project_id"])[i] + + output_file = "%s/%s.h5ad" % (output_dir, s) + if os.path.exists(output_file): + os.remove(output_file) + + url = prefix_disco_url + "download/getRawH5/" + p + "/" + s + rna = sc.read_10x_h5(filename=output_file, backup_url=url) + + cell = pd.read_csv( + prefix_disco_url + "toolkit/" + "getCellTypeSample?sampleId=" + s, + sep="\t", + header=0, + ) + # cell.set_index("cell_id") + cell = cell.iloc[:, [0, 2, 5]] + if metadata.filter.cell_type is not None: + if metadata.filter.cell_type_confidence == "high": + cell = cell.loc[cell["cell_type_score"] >= 0.8] + + elif metadata.filter.cell_type_confidence == "medium": + cell = cell.loc[cell["cell_type_score"] >= 0.6] + cell = cell.loc[cell.cell_type.isin(metadata.filter.cell_type)] - # checking for file and ignore if it is already exist - if (os.path.exists(output_file)) and \ - hashlib.md5(open(output_file, "rb").read()).hexdigest() == list(samples["h5adMd5"]): - logging.info(" %s has been downloaded before. Ignore ..." 
% (s)) # give message to the user - else: - logging.info("Downloading data of %s" % (s)) # give message to the user - try: - # get the data from the server - response = requests.get(url=prefix_disco_url + "getH5adBySampleCt?project="+ p +"&sample=" + s) - if response.status_code == 404: - error_sample.append(s) # append error samples - logging.warning("sample %s download fail 1" % (s)) # give message to the user - else: - open(output_file, "wb").write(response.content) - - # condition for another error - if hashlib.md5(open(output_file, "rb").read()).hexdigest() != list(samples["h5adMd5"])[i]: - error_sample.append(s) - os.remove(output_file) - logging.warning("sample %s download fail" % (s)) # give message to the user - except: - error_sample.append(s) - logging.warning("sample %s download fail" % (s)) # give message to the user - - # similarly, record the error samples and return to the variable - if len(error_sample) > 0: - metadata.cell_type_metadata = metadata.cell_type_metadata.iloc[check_in_list(metadata.cell_type_metadata["sampleId"], error_sample)] - metadata.sample_metadata = metadata.sample_metadata.iloc[check_in_list(list(metadata.sample_metadata["sampleId"]), metadata.cell_type_metadata["sampleId"])] - metadata.cell_count = metadata.cell_type_metadata["cellNumber"].sum() - metadata.sample_count = len(error_sample) - sample_cell_count = pd.DataFrame(metadata.cell_type_metadata.groupby(["sampleId"])["cellNumber"].agg("sum")) - sample_cell_count.columns = ["x"] - metadata.sample_metadata["cell_number"] = list(sample_cell_count.loc[list(metadata.sample_metadata["sampleId"])]["x"]) - return metadata - - # else return None as the data has been downloaded - return None \ No newline at end of file + rna = rna[cell.cell_id].copy() + rna.obs["cell_type"] = list(cell.cell_type) + rna.obs["sample_id"] = s + rna.obs["project_id"] = p + rna.write(output_file) + + # if metadata.filter.cell_type is None: + # # getting the dataframe of the samples + # samples = metadata.sample_metadata + + # # iterate through all the samples + # for i in range(len(samples)): + # s = list(samples["sampleId"])[i] + # p = list(samples["projectId"])[i] + # output_file = "%s/%s.h5ad" % (output_dir, s) # getting the name of the file + + # # condition if the file has been downloaded before + # if (os.path.exists(output_file)) and hashlib.md5( + # open(output_file, "rb").read() + # ).hexdigest() == list(samples["md5h5ad"])[i]: + # logging.info( + # " %s has been downloaded before. Ignore ..." 
% (s) + # ) # giving message to the user + # else: + # logging.info( + # "Downloading data of %s" % (s) + # ) # giving message to the user + + # try: + # # try to download the data as requested + # response = requests.get( + # url=prefix_disco_url + # + "getH5adBySample?project=" + # + p + # + "&sample=" + # + s + # ) + + # # condition for error in downloading the samples + # if response.status_code == 404: + # error_sample.append( + # s + # ) # append the error samples into a list for debugging later on + # logging.warning( + # "sample %s download fail" % (s) + # ) # giving message to the user + # else: + # # write the file to directory + # open(output_file, "wb").write(response.content) + + # # condition for another error + # if ( + # hashlib.md5(open(output_file, "rb").read()).hexdigest() + # != list(samples["md5h5ad"])[i] + # ): + # error_sample.append(s) + # os.remove(output_file) + # logging.warning( + # "sample %s download fail" % (s) + # ) # give message to the user + # except: + # # except error when the download fail + # error_sample.append(s) + # logging.warning("sample %s download fail" % (s)) + + # # if there are error samples, we set all the attribute for FilterData object to the error samples and return the object in FilterData object + # if len(error_sample) > 0: + # metadata.sample_metadata = metadata.sample_metadata.set_index( + # "sampleId" + # ) # change from inplace to this to increase efficiency + # metadata.sample_metadata = metadata.sample_metadata.loc[error_sample] + # metadata.cell_type_metadata = metadata.cell_type_metadata.iloc[ + # check_in_list(metadata.cell_type_metadata["sampleId"], error_sample) + # ] + # metadata.cell_count = metadata.cell_type_metadata["cellNumber"].sum() + # metadata.sample_count = len(error_sample) + # return metadata + + # else: + # # get the dataframe for the metadata + # metadata.sample_metadata = metadata.sample_metadata.set_index("sampleId") + # samples = metadata.cell_type_metadata + # concat_func = lambda x, y: x + "_" + str(y) + + # # create new element by concatenating the sampleId and cluster + # samples["sampleId"] = list( + # map(concat_func, list(samples["sampleId"]), list(samples["cluster"])) + # ) + + # # similarly, iterative through all the sample for downloading + # for i in range(len(samples)): + # s = list(samples["sampleId"])[i] + # sub_s = s.replace( + # "_" + str(list(samples["cluster"])[i]), "" + # ) # adding handler for celltype specific to get the data + # p = metadata.sample_metadata.loc[sub_s]["projectId"] + # output_file = "%s/%s.h5ad" % (output_dir, s) + + # # checking for file and ignore if it is already exist + # if (os.path.exists(output_file)) and hashlib.md5( + # open(output_file, "rb").read() + # ).hexdigest() == list(samples["h5adMd5"]): + # logging.info( + # " %s has been downloaded before. Ignore ..." 
% (s) + # ) # give message to the user + # else: + # logging.info("Downloading data of %s" % (s)) # give message to the user + # try: + # # get the data from the server + # response = requests.get( + # url=prefix_disco_url + # + "getH5adBySampleCt?project=" + # + p + # + "&sample=" + # + s + # ) + # if response.status_code == 404: + # error_sample.append(s) # append error samples + # logging.warning( + # "sample %s download fail 1" % (s) + # ) # give message to the user + # else: + # open(output_file, "wb").write(response.content) + + # # condition for another error + # if ( + # hashlib.md5(open(output_file, "rb").read()).hexdigest() + # != list(samples["h5adMd5"])[i] + # ): + # error_sample.append(s) + # os.remove(output_file) + # logging.warning( + # "sample %s download fail" % (s) + # ) # give message to the user + # except: + # error_sample.append(s) + # logging.warning( + # "sample %s download fail" % (s) + # ) # give message to the user + + # # similarly, record the error samples and return to the variable + # if len(error_sample) > 0: + # metadata.cell_type_metadata = metadata.cell_type_metadata.iloc[ + # check_in_list(metadata.cell_type_metadata["sampleId"], error_sample) + # ] + # metadata.sample_metadata = metadata.sample_metadata.iloc[ + # check_in_list( + # list(metadata.sample_metadata["sampleId"]), + # metadata.cell_type_metadata["sampleId"], + # ) + # ] + # metadata.cell_count = metadata.cell_type_metadata["cellNumber"].sum() + # metadata.sample_count = len(error_sample) + # sample_cell_count = pd.DataFrame( + # metadata.cell_type_metadata.groupby(["sampleId"])["cellNumber"].agg( + # "sum" + # ) + # ) + # sample_cell_count.columns = ["x"] + # metadata.sample_metadata["cell_number"] = list( + # sample_cell_count.loc[list(metadata.sample_metadata["sampleId"])]["x"] + # ) + # return metadata + + # # else return None as the data has been downloaded + # return None diff --git a/build/lib/discotoolkit/GetMetadata.py b/build/lib/discotoolkit/GetMetadata.py index d641320..82f681a 100644 --- a/build/lib/discotoolkit/GetMetadata.py +++ b/build/lib/discotoolkit/GetMetadata.py @@ -14,7 +14,8 @@ from .GlobalVariable import logging, prefix_disco_url from .DiscoClass import FilterData, Filter -def get_json(url : str, info_msg: str, error_msg: str, prefix : bool = True): + +def get_json(url: str, info_msg: str, error_msg: str, prefix: bool = True): """Funtion to get json data from the server Args: @@ -26,36 +27,37 @@ def get_json(url : str, info_msg: str, error_msg: str, prefix : bool = True): Returns: JSON: return JSON data which will be converted into pandas dataframe later on """ - + # giving log message to the user logging.info(info_msg) # condition for adding prefix - if prefix == False: + if not prefix: response = requests.get(url) else: - response = requests.get(prefix_disco_url + "/" + url) - + response = requests.get(prefix_disco_url + url) + # if the loading of JSON is successful, we get the text if response.status_code == 200: data = json.loads(response.text) - return data # return json data Dictionary datatype in python + return data # return json data Dictionary datatype in python else: - # print informative message to the user logging.error(error_msg) + def get_sample_ct_info(): - """ Get cell type information of sample + """Get cell type information of sample Returns: Pandas DataFrame: return pandas dataframe to the user - """ - temp = get_json(url = "/getSampleCtInfo", info_msg = "Retrieving cell type information of each sample from DISCO database", - error_msg = "Failed to 
retrieve cell type information. Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.") - return pd.DataFrame(temp) # return Dataframe of the JSON data + """ + temp = pd.read_csv( + prefix_disco_url + "toolkit/" + "getCellTypeSummary", sep="\t", header=0 + ) + return temp # return Dataframe of the JSON data -def find_celltype(term : str = "", cell_ontology : dict = None): +def find_celltype(term: str = "", cell_ontology: dict = None): """find the celltype within the disco dataset Args: @@ -68,9 +70,12 @@ def find_celltype(term : str = "", cell_ontology : dict = None): # when the user do not have cell ontology, we get the default one from disco database if cell_ontology is None: - cell_ontology = get_json(url = "/getCellOntology", info_msg = "Retrieving ontology from DISCO database", - error_msg = "Failed to retrieve ontology Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.") - cell_type = pd.DataFrame(cell_ontology)["cell_name"] # convert to dataframe + cell_ontology = get_json( + url="toolkit/getCellOntology", + info_msg="Retrieving ontology from DISCO database", + error_msg="Failed to retrieve ontology Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.", + ) + cell_type = pd.DataFrame(cell_ontology)["cell_name"] # convert to dataframe # convert term to lower term for matching. the same apply for the cell_type term = term.lower() @@ -87,22 +92,30 @@ def find_celltype(term : str = "", cell_ontology : dict = None): # get the index and return the cell type in the correct case in the form of Python List return list(cell_type[idx_list]) -def get_disco_metadata(): +def get_disco_metadata(): """get metadata of the disco into the dataframe Returns: Pandas Dataframe: Disco metadata in the format of dataframe which will be used for the filter class - """ + """ - metadata = get_json(url = "http://www.immunesinglecell.org/api/vishuo/sample/all", info_msg = "Retrieving metadata from DISCO database", - error_msg = "Failed to retrieve metadata. Please try again. 
If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.", prefix=False) - metadata = pd.DataFrame(metadata) - metadata = metadata[metadata["processStatus"] == "QC pass"] # filtering to only get the sample that pass quality control - metadata.set_index("sampleId") # setting the index of the metadata to sample_id for consistency + metadata = pd.read_csv( + "https://immunesinglecell.org/disco_v3_api/toolkit/getSampleMetadata", + sep="\t", + header=0, + ) + # metadata = pd.DataFrame(metadata) + # metadata = metadata[ + # metadata["processStatus"] == "QC pass" + # ] # filtering to only get the sample that pass quality control + metadata.set_index( + "sample_id" + ) # setting the index of the metadata to sample_id for consistency return metadata -def get_celltype_children(cell_type: Union[str , list], cell_ontology: dict = None): + +def get_celltype_children(cell_type: Union[str, list], cell_ontology: dict = None): """get the children of the input celltype from the user Args: @@ -113,8 +126,13 @@ def get_celltype_children(cell_type: Union[str , list], cell_ontology: dict = No List of String: return the children of the defined celltype in list of String """ if cell_ontology is None: - cell_ontology = pd.DataFrame(get_json(url="/getCellOntology", info_msg = "Retrieving ontology from DISCO database", - error_msg = "Failed to retrieve ontology Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.")) + cell_ontology = pd.DataFrame( + get_json( + url="toolkit/getCellOntology", + info_msg="Retrieving ontology from DISCO database", + error_msg="Failed to retrieve ontology Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.", + ) + ) # define an empty list children = [] @@ -129,13 +147,20 @@ def get_celltype_children(cell_type: Union[str , list], cell_ontology: dict = No # while loop to get all the children celltypes iteratively while len(cell_type) > 0: - idx_list = [index for index, each in enumerate(list(cell_ontology["parent"])) if each in cell_type] + idx_list = [ + index + for index, each in enumerate(list(cell_ontology["parent"])) + if each in cell_type + ] children.extend(list(cell_ontology["cell_name"][idx_list])) cell_type = list(cell_ontology["cell_name"][idx_list]) - children = list(set(children)) # ensure that the obtained children of celltype are unique + children = list( + set(children) + ) # ensure that the obtained children of celltype are unique return children + def list_metadata_item(field: str): """List element inside the metadata columns @@ -153,24 +178,25 @@ def list_metadata_item(field: str): if field in metadata.columns: return list(set(metadata[field])) else: - logging.warning("DISCO data don't contain '%s' field. Please check the field name" % (field)) + logging.warning( + "DISCO data don't contain '%s' field. 
Please check the field name" % (field) + ) return None - -def list_all_columns(): + +def list_all_columns(): """list all the columns found in the metadata of the disco database Returns: List: return the name of the metadata in the form of list of string - """ - + """ + # get the disco metadata metadata = get_disco_metadata() return list(metadata.columns) -def filter_disco_metadata(filter : Filter = Filter()): - +def filter_disco_metadata(filter: Filter = Filter()): """filter function option for the disco data Args: Filter (Class): predefined Filter class with default attribute to filter data for the user @@ -182,15 +208,17 @@ def filter_disco_metadata(filter : Filter = Filter()): # starting with defining variable filter_data = FilterData() - metadata = get_disco_metadata() # get metadata - logging.info("Filtering sample") # producing log message to the user + metadata = get_disco_metadata() # get metadata + logging.info("Filtering sample") # producing log message to the user # condition for each provided attribute by the user or default - if filter.sample is not None: - metadata = metadata.iloc[check_in_list(metadata["sampleId"], filter.sample)] + if filter.sample_id is not None: + metadata = metadata.iloc[check_in_list(metadata["sample_id"], filter.sample_id)] - if filter.project is not None: - metadata = metadata.iloc[check_in_list(metadata["projectId"], filter.project)] + if filter.project_id is not None: + metadata = metadata.iloc[ + check_in_list(metadata["project_id"], filter.project_id) + ] if filter.tissue is not None: metadata = metadata.iloc[check_in_list(metadata["tissue"], filter.tissue)] @@ -202,81 +230,130 @@ def filter_disco_metadata(filter : Filter = Filter()): metadata = metadata.iloc[check_in_list(metadata["disease"], filter.disease)] if filter.sample_type is not None: - metadata = metadata.iloc[check_in_list(metadata["sampleType"], filter.sample_type)] + metadata = metadata.iloc[ + check_in_list(metadata["sample_type"], filter.sample_type) + ] # condition when there is no samples with the provided filters if len(metadata) == 0: logging.warn("Sorry, no samples passed the applied filters.") return None - + # get the celltype information sample_ct_info = get_sample_ct_info() # remove the unnecessary fields from the metadata and only keep the below defined columns - retain_field = ["sampleId", "projectId", "sampleType", "anatomicalSite", "disease", - "tissue", "platform", "ageGroup", "age", "gender", "cellSorting", - "diseaseSubtype", "diseaseStage", "treatment", "md5h5ad"] - + # retain_field = [ + # "sample_id", + # "projectId", + # "sample_type", + # "anatomicalSite", + # "disease", + # "tissue", + # "platform", + # "ageGroup", + # "age", + # "gender", + # "cellSorting", + # "diseaseSubtype", + # "diseaseStage", + # "treatment", + # "md5h5ad", + # ] + # subset to the retained field - metadata = metadata[retain_field] + # metadata = metadata[retain_field] # checking for the specific cell type if filter.cell_type is None: - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["sampleId"], metadata["sampleId"])] - sample_cell_count = pd.DataFrame(sample_ct_info.groupby(["sampleId"])["cellNumber"].agg("sum")) # get the total number of cells - sample_cell_count.columns = ["x"] # assigning different column name - filtered_index = np.intersect1d(sample_cell_count.index, list(metadata["sampleId"])) # getting the common sampleId from the ct info and metadata info - metadata = metadata.loc[metadata['sampleId'].isin(filtered_index)] # filtering to only the common 
sampleId - metadata["cell_number"] = list(sample_cell_count.loc[filtered_index]["x"]) # assigning to the metadata - metadata = metadata[metadata["cell_number"] > filter.min_cell_per_sample] # filter by the minimum cell per sample + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["sample_id"], metadata["sample_id"]) + ] + sample_cell_count = pd.DataFrame( + sample_ct_info.groupby(["sample_id"])["cell_number"].agg("sum") + ) # get the total number of cells + sample_cell_count.columns = ["x"] # assigning different column name + filtered_index = np.intersect1d( + sample_cell_count.index, list(metadata["sample_id"]) + ) # getting the common sample_id from the ct info and metadata info + metadata = metadata.loc[ + metadata["sample_id"].isin(filtered_index) + ] # filtering to only the common sample_id + metadata["cell_number"] = list( + sample_cell_count.loc[filtered_index]["x"] + ) # assigning to the metadata + metadata = metadata[ + metadata["cell_number"] > filter.min_cell_per_sample + ] # filter by the minimum cell per sample # condiition for none if len(metadata) == 0: logging.warn("Sorry, no samples passed the applied filters.") return None - # subsetting based on the sampleId - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["sampleId"], metadata["sampleId"])] - + # subsetting based on the sample_id + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["sample_id"], metadata["sample_id"]) + ] + # now setting the attribute of the FilterData filter_data.sample_metadata = metadata filter_data.cell_type_metadata = sample_ct_info filter_data.sample_count = len(metadata) filter_data.cell_count = sum(metadata["cell_number"]) - filter_data.filter = filter # now set all the changes we made to the Filter object + filter_data.filter = ( + filter # now set all the changes we made to the Filter object + ) # giving the message to the user - logging.info("%s samples and %s cells were found" % (filter_data.sample_count, filter_data.cell_count)) + logging.info( + "%s samples and %s cells were found" + % (filter_data.sample_count, filter_data.cell_count) + ) return filter_data - + # condition to get all the data for the sub celltype if filter.include_cell_type_childen: filter.cell_type = get_celltype_children(filter.cell_type) # subset to the given cell type and sample - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["cellType"], filter.cell_type)] - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["sampleId"], list(metadata["sampleId"]))] + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["cell_type"], filter.cell_type) + ] + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["sample_id"], list(metadata["sample_id"])) + ] # condition for different values of the cell_type_confidence if filter.cell_type_confidence not in ["high", "medium", "all"]: - logging.warning("cell_type_confidence can only be high, medium, or all") # give the user an informative message + logging.warning( + "cell_type_confidence can only be high, medium, or all" + ) # give the user an informative message if filter.cell_type_confidence == "high": - sample_ct_info = sample_ct_info[sample_ct_info["cellTypeScore"] >= 0.8] + sample_ct_info = sample_ct_info[sample_ct_info["cell_type_score"] >= 0.8] elif filter.cell_type_confidence == "medium": - sample_ct_info = sample_ct_info[sample_ct_info["cellTypeScore"] >= 0.6] + sample_ct_info = sample_ct_info[sample_ct_info["cell_type_score"] >= 0.6] # again condition for 
no sample found and return None if len(sample_ct_info) == 0: logging.warn("Sorry, no samples passed the applied filters.") return None - + # follow from previous block of code - metadata = metadata.iloc[check_in_list(list(metadata["sampleId"]), sample_ct_info["sampleId"])] - sample_cell_count = pd.DataFrame(sample_ct_info.groupby(["sampleId"])["cellNumber"].agg("sum")) # total cell numbers + metadata = metadata.iloc[ + check_in_list(list(metadata["sample_id"]), sample_ct_info["sample_id"]) + ] + sample_cell_count = pd.DataFrame( + sample_ct_info.groupby(["sample_id"])["cell_number"].agg("sum") + ) # total cell numbers sample_cell_count.columns = ["x"] - - metadata["cell_number"] = list(sample_cell_count.loc[list(metadata["sampleId"])]["x"]) # count number of cell - metadata = metadata[metadata["cell_number"] > filter.min_cell_per_sample] # filter based on min cell per sample + + metadata["cell_number"] = list( + sample_cell_count.loc[list(metadata["sample_id"])]["x"] + ) # count number of cell + metadata = metadata[ + metadata["cell_number"] > filter.min_cell_per_sample + ] # filter based on min cell per sample # condition for None if len(metadata) == 0: @@ -284,7 +361,9 @@ def filter_disco_metadata(filter : Filter = Filter()): return None # subsetting before assign attribute for the FilterData object - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["sampleId"], metadata["sampleId"])] + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["sample_id"], metadata["sample_id"]) + ] # now assign all the values to the attribute of the object filter_data.sample_metadata = metadata @@ -292,9 +371,13 @@ def filter_disco_metadata(filter : Filter = Filter()): filter_data.sample_count = len(metadata) filter_data.cell_count = sum(metadata["cell_number"]) filter_data.filter = filter - logging.info("%s samples and %s cells were found" % (filter_data.sample_count, filter_data.cell_count)) + logging.info( + "%s samples and %s cells were found" + % (filter_data.sample_count, filter_data.cell_count) + ) return filter_data + # utils function for getting the index of the element on the left list for subsetting if the element is exist in the right list. def check_in_list(var, whitelist): - return [index for index, each in enumerate(list(var)) if each in list(whitelist)] \ No newline at end of file + return [index for index, each in enumerate(list(var)) if each in list(whitelist)] diff --git a/build/lib/discotoolkit/GlobalVariable.py b/build/lib/discotoolkit/GlobalVariable.py index b737a21..ec626ab 100644 --- a/build/lib/discotoolkit/GlobalVariable.py +++ b/build/lib/discotoolkit/GlobalVariable.py @@ -1,3 +1,11 @@ +''' +Descripttion: +version: +Author: Mengwei Li +Date: 2023-07-06 16:59:59 +LastEditors: Mengwei Li +LastEditTime: 2024-10-23 15:52:29 +''' """ Global variable file to import for the subsequent script. 
@@ -17,9 +25,9 @@ timeout = 600 # Define package-level variable -response = requests.get("http://www.immunesinglecell.org/api/vishuo/getToolkitUrl") +# response = requests.get("http://www.immunesinglecell.org/api/vishuo/getToolkitUrl") -if response.status_code == 200: - prefix_disco_url = json.loads(response.text)["url"] -else: - prefix_disco_url = "http://www.immunesinglecell.org/toolkitapi" \ No newline at end of file +# if response.status_code == 200: +# prefix_disco_url = json.loads(response.text)["url"] +# else: +prefix_disco_url = "https://immunesinglecell.org/disco_v3_api/" \ No newline at end of file diff --git a/build/lib/discotoolkit/Utilities.py b/build/lib/discotoolkit/Utilities.py new file mode 100644 index 0000000..1fd4b9c --- /dev/null +++ b/build/lib/discotoolkit/Utilities.py @@ -0,0 +1,74 @@ +import h5py +import numpy as np +from scipy.sparse import csr_matrix +from pathlib import Path + + +def write_10X_h5(adata, file): + """Writes adata to a 10X-formatted h5 file. + + Note that this function is not fully tested and may not work for all cases. + It will not write the following keys to the h5 file compared to 10X: + '_all_tag_keys', 'pattern', 'read', 'sequence' + + Args: + adata (AnnData object): AnnData object to be written. + file (str): File name to be written to. If no extension is given, '.h5' is appended. + + Raises: + FileExistsError: If file already exists. + + Returns: + None + """ + + if ".h5" not in file: + file = f"{file}.h5" + if Path(file).exists(): + raise FileExistsError(f"There already is a file `{file}`.") + + def int_max(x): + return int(max(np.floor(len(str(int(max(x)))) / 4), 1) * 4) + + def str_max(x): + return max([len(i) for i in x]) + + w = h5py.File(file, "w") + grp = w.create_group("matrix") + grp.create_dataset( + "barcodes", + data=np.array(adata.obs_names, dtype=f"|S{str_max(adata.obs_names)}"), + ) + grp.create_dataset( + "data", data=np.array(adata.X.data, dtype=f" 3: - logging.info("Any value of n_predict that exceeds 3 will be automatically adjusted to 3.") + logging.info( + "Any value of n_predict that exceeds 3 will be automatically adjusted to 3." 
+ ) n_predict = 3 - + # Download reference data and ref_deg if missing if (ref_data is None) or (ref_deg is None): - # check for the data path if ref_path is None: ref_path = "DISCOtmp" @@ -94,42 +138,60 @@ def CELLiD_cluster(rna, ref_data : pd.DataFrame = None, ref_deg : pd.DataFrame = # if the path does not exist, we create a default directory if not os.path.exists(ref_path): os.mkdir(ref_path) - + # download reference data from the server in pickle extension and read as pandas dataframe if ref_data is None: - if not(os.path.exists(ref_path + "/ref_data.pkl")): + if not (os.path.exists(ref_path + "/ref_data.pkl")): logging.info("Downloading reference dataset...") - response = requests.get(url=prefix_disco_url +"/getRefPkl", timeout=timeout) + response = requests.get( + url=prefix_disco_url + "toolkit/getRef?type=pkl", timeout=timeout + ) open(ref_path + "/ref_data.pkl", "wb").write(response.content) - ref_data = pd.read_pickle(ref_path + "/ref_data.pkl", compression = {'method':'gzip','compresslevel':6}) + ref_data = pd.read_pickle( + ref_path + "/ref_data.pkl", + compression={"method": "gzip", "compresslevel": 6}, + ) # similarly do the same for the DEG reference if ref_deg is None: - if not(os.path.exists(ref_path + "/ref_deg.pkl")): + if not (os.path.exists(ref_path + "/ref_deg.pkl")): logging.info("Downloading deg dataset...") - response = requests.get(url=prefix_disco_url +"/getRefDegPkl", timeout=timeout) + response = requests.get( + url=prefix_disco_url + "toolkit/getRefDeg?type=pkl", timeout=timeout + ) open(ref_path + "/ref_deg.pkl", "wb").write(response.content) - ref_deg = pd.read_pickle(ref_path + "/ref_deg.pkl", compression = {'method':'gzip','compresslevel':6}) - + ref_deg = pd.read_pickle( + ref_path + "/ref_deg.pkl", + compression={"method": "gzip", "compresslevel": 6}, + ) + #### # write list of atlas function # we can write a function to get all the atlas from disco to the users # subsetting to the selected atlas by the user if atlas is not None: - select_ref = [index for index, each in enumerate(ref_data.columns) if each.split("--")[1] in list(atlas)] + select_ref = [ + index + for index, each in enumerate(ref_data.columns) + if each.split("--")[1] in list(atlas) + ] ref_data = ref_data.iloc[:, select_ref] - + # get the intersection of the gene for the user data and the reference data genes = set.intersection(set(rna.index), set(ref_data.index)) # give an error as 2000 genes are very low if len(genes) <= 2000: - logging.error("Less than 2000 genes are shared between the input data and the reference dataset!") + logging.error( + "Less than 2000 genes are shared between the input data and the reference dataset!" + ) # give a warning to the user if len(genes) <= 5000: - logging.warning("The input data and reference dataset have a limited number of overlapping genes, which may potentially impact the accuracy of the CELLiD.") - + logging.warning( + "The input data and reference dataset have a limited number of overlapping genes, which may potentially impact the accuracy of the CELLiD." 
+ ) + # subsetting the dataframe to the common genes across data and the reference rna = rna.loc[list(genes)] ref_data = ref_data.loc[list(genes)] @@ -137,7 +199,7 @@ def CELLiD_cluster(rna, ref_data : pd.DataFrame = None, ref_deg : pd.DataFrame = # nested apply function to compute correlation for each cell type between the user and reference database def each_ref_correlation(input, ref_data): input = list(input) - res = ref_data.parallel_apply(basic_correlation, input = input, axis = 0) + res = ref_data.parallel_apply(basic_correlation, input=input, axis=0) return res # apply the correlation to all the columns in user data @@ -148,9 +210,14 @@ def basic_correlation(ref, input): predicted = pd.Series(list(ref)).corr(pd.Series(list(input)), method="spearman") return predicted - + # get the predicted correlation as in pandas DataFrame - predicted_cell = rna.apply(each_ref_correlation, result_type = "expand", axis = 0, ref_data = ref_data) + # predicted_cell = pd.DataFrame( + # generate_correlation_map(ref_data.to_numpy().T, rna.to_numpy().T) + # ) + predicted_cell = rna.apply( + each_ref_correlation, result_type="expand", axis=0, ref_data=ref_data + ) predicted_cell = pd.DataFrame(predicted_cell) # set the index to be the celltype based on the reference_data @@ -161,60 +228,84 @@ def basic_correlation(ref, input): # define a function to apply to each column of the dataframe def get_top_5_indices(col): # get the indices of the top 5 values in the column - indices = np.where(col.rank(method='min', ascending=False) <= 5)[0] + indices = np.where(col.rank(method="min", ascending=False) <= 5)[0] # convert to numeric and return - return pd.Series(indices).astype('int') - - ct = predicted_cell.apply(get_top_5_indices) - - # ct = pd.DataFrame(predicted_cell.parallel_apply(lambda x: [index for index, each in enumerate(x.rank()) if each <= 5], axis = 0, result_type = "expand")) + return pd.Series(indices).astype("int") + ct = predicted_cell.apply(get_top_5_indices) # compute the correleration based on the genes set that is intersection between the user data and reference dataset + def second_correlation(i, ref_data, rna, ct): - ref = ref_data.iloc[:,ct[i]].copy() - g = set(ref_deg.iloc[check_in_list(ref_deg["group"], ref.columns)].copy()["gene"]) + ref = ref_data.iloc[:, list(ct[str(i)])].copy() + g = set( + ref_deg.iloc[check_in_list(ref_deg["group"], ref.columns)].copy()["gene"] + ) g = set.intersection(set(ref.index), g) ref = ref.loc[list(g)].copy() - input = rna.loc[list(g), i].copy() + # print(rna.loc[list(g), str(i)]) + input = rna.loc[list(g), str(i)].copy() # predict = ref.apply(lambda i: np.round(stats.spearmanr(pd.to_numeric(i), pd.to_numeric(input), nan_policy='omit')[0], 3)) - predict = ref.apply(lambda i: np.round(pd.Series(i).corr(pd.Series(input), method='spearman'), 3)) # trying with pandas correlation + predict = ref.apply( + lambda i: np.round( + pd.Series(i).corr(pd.Series(input), method="spearman"), 3 + ) + ) # trying with pandas correlation predict = pd.DataFrame(predict) predict.columns = ["cor"] - predict = predict.sort_values(["cor"], ascending = False) - res = [[each.split("--")[0], each.split("--")[1], predict["cor"][index], i] for index, each in enumerate(predict.index) if index < n_predict] - res = pd.DataFrame(res, columns =['cell_type', 'atlas', "score", "input_index"]) + predict = predict.sort_values(["cor"], ascending=False) + res = [ + [each.split("--")[0], each.split("--")[1], predict["cor"][index], i] + for index, each in enumerate(predict.index) + if index < 
n_predict + ] + res = pd.DataFrame(res, columns=["cell_type", "atlas", "score", "input_index"]) return res # return list in the format cell_type, atlas, score, input_index # get a data in the format of cell_type, atlas, score, input_index - predicted_cell = Parallel(n_jobs=ncores, batch_size=32, verbose = 2)(delayed(second_correlation)(int(y), ref_data, rna, ct) for y in range(rna.shape[1])) + predicted_cell = Parallel(n_jobs=ncores, batch_size=32, verbose=2)( + delayed(second_correlation)(int(y), ref_data, rna, ct) + for y in range(rna.shape[1]) + ) # first concatenate in row wise manner predicted_cell = pd.concat(predicted_cell) - pivot = pd.pivot_table(predicted_cell, index='input_index', columns=predicted_cell.groupby('input_index').cumcount()+1, - values=['cell_type', 'atlas', 'score'], aggfunc='first') + pivot = pd.pivot_table( + predicted_cell, + index="input_index", + columns=predicted_cell.groupby("input_index").cumcount() + 1, + values=["cell_type", "atlas", "score"], + aggfunc="first", + ) - pivot = pivot[["cell_type", "atlas", "score"]] + pivot = pivot[["cell_type", "atlas", "score"]] # flatten multi-level column index - pivot.columns = ['_'.join(map(str, col)).strip() for col in pivot.columns.values] + pivot.columns = ["_".join(map(str, col)).strip() for col in pivot.columns.values] # rename columns column_names = {} for i in range(n_predict): - column_names[f'cell_type_{i+1}'] = f'predicted_cell_type_{i+1}' - column_names[f'atlas_{i+1}'] = f'source_atlas_{i+1}' - column_names[f'score_{i+1}'] = f'score_{i+1}' - + column_names[f"cell_type_{i+1}"] = f"predicted_cell_type_{i+1}" + column_names[f"atlas_{i+1}"] = f"source_atlas_{i+1}" + column_names[f"score_{i+1}"] = f"score_{i+1}" + pivot = pivot.rename(columns=column_names) - return pivot# return pandas dataframe in the format the same original R package + return pivot # return pandas dataframe in the format the same original R package + """ Geneset enrichment analysis using DISCO data """ -def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref_path : str = None, ncores : int = 1): + +def CELLiD_enrichment( + input: pd.DataFrame, + reference: pd.DataFrame = None, + ref_path: str = None, + ncores: int = 1, +): """Function to generate enrichment analysis based on the reference gene sets and following the DISCO pipeline. 
Args: @@ -229,7 +320,7 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref # check input data # check if the input type is a DataFrame - if not(isinstance(input, pd.DataFrame)): + if not (isinstance(input, pd.DataFrame)): logging.error("The input must be a dataframe") # check if the user provide more than the needed column which are gene and fc @@ -240,7 +331,7 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref if input.shape[1] == 2: input_shape = 2 input.columns = ["gene", "fc"] - + # else we can just take a gene list for the enrichment else: input_shape = 1 @@ -248,7 +339,6 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref # if the user does not provide reference geneset list if reference is None: - # check for the data path if it is provided by the user if ref_path is None: ref_path = "DISCOtmp" @@ -256,31 +346,45 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref # if the path does not exist, we create a default directory if not os.path.exists(ref_path): os.mkdir(ref_path) - + # check if the file does not exist if not (os.path.exists(ref_path + "/ref_geneset.pkl")): # downloading the geneset data from disco database - response = requests.get(url=prefix_disco_url +"/getGeneSetPkl", timeout=timeout) + response = requests.get( + url=prefix_disco_url + "/getGeneSetPkl", timeout=timeout + ) open(ref_path + "/ref_geneset.pkl", "wb").write(response.content) - reference = pd.read_pickle(ref_path + "/ref_geneset.pkl", compression = {'method':'gzip','compresslevel':6}) + reference = pd.read_pickle( + ref_path + "/ref_geneset.pkl", + compression={"method": "gzip", "compresslevel": 6}, + ) else: # read the data into pandas dataframe for subsequent analysis - reference = pd.read_pickle(ref_path + "/ref_geneset.pkl", compression = {'method':'gzip','compresslevel':6}) + reference = pd.read_pickle( + ref_path + "/ref_geneset.pkl", + compression={"method": "gzip", "compresslevel": 6}, + ) # rename the name data to include the reference atlas reference["name"] = reference["name"] + " in " + reference["atlas"] - input["gene"] = input["gene"].str.upper() # convert all the gene to upper string as the reference gene name is in uppercase + input["gene"] = input[ + "gene" + ].str.upper() # convert all the gene to upper string as the reference gene name is in uppercase # condition when the user provide the fold change for the enrichment analysis if input_shape == 2: - input["fc"] = 2 ** input["fc"] # 2 to the power of fc - input = input.set_index("gene", drop = False) # set index to the gene - input = input.loc[list(set(reference["gene"]) & set(input['gene']))] # only get the common genes for comparison + input["fc"] = 2 ** input["fc"] # 2 to the power of fc + input = input.set_index("gene", drop=False) # set index to the gene + input = input.loc[ + list(set(reference["gene"]) & set(input["gene"])) + ] # only get the common genes for comparison # else we only look at the ranked gene list else: - input = input.set_index("gene", drop = False) - input = input.loc[list(set(reference["gene"]) & set(input['gene']))] # only get the common genes for comparison + input = input.set_index("gene", drop=False) + input = input.loc[ + list(set(reference["gene"]) & set(input["gene"])) + ] # only get the common genes for comparison # filter to get only the geneset that contain the input genes from the user # this might good different result from the website so lets remove it for # unique_names = 
reference[reference["gene"].isin(input["gene"])]["name"].unique() @@ -291,8 +395,8 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref # optimise the CELLiD_enrichment function atlas_dict = {} name_dict = {} - reference_atlas = reference['atlas'] - reference_name = reference['name'] + reference_atlas = reference["atlas"] + reference_name = reference["name"] # pre compute the parameters of the fisher exact test for the reference data for rownum in range(len(reference)): @@ -305,57 +409,105 @@ def CELLiD_enrichment(input : pd.DataFrame, reference : pd.DataFrame = None, ref name_dict[reference_name[rownum]] = [rownum] else: name_dict[reference_name[rownum]].append(rownum) - + # apply function to the dataframe - def process_unique_name(reference_filter, input, atlas_dfs, input_shape, atlas_dict): + def process_unique_name( + reference_filter, input, atlas_dfs, input_shape, atlas_dict + ): # use the compile`d pattern to search for matches - unique_name = reference_filter['name'].head(1).values[0] - atlas = reference_filter['atlas'].head(1).values + unique_name = reference_filter["name"].head(1).values[0] + atlas = reference_filter["atlas"].head(1).values # print(unique_name) reference_full = atlas_dfs[list(atlas_dict.keys()).index(atlas)] # reference_filter = reference.iloc[name_dict[unique_name]].copy() # subset the reference data # reference_full = reference.iloc[atlas_dict[atlas]].copy() # full reference to the atlas - reference_filter = reference_filter.set_index(["gene"], drop = False) # set gene as the index + reference_filter = reference_filter.set_index( + ["gene"], drop=False + ) # set gene as the index # reference_filter["gene"] = reference_filter.index - input_filter = input.loc[list(set(reference_full["gene"]) & set(input["gene"]))] # subset the input genes + input_filter = input.loc[ + list(set(reference_full["gene"]) & set(input["gene"])) + ] # subset the input genes # condition for no passed gene set if input_filter.empty: return None - + # different computation to either include the fold change in the fisher exact test if input_shape == 2: - a = (reference_filter.loc[np.intersect1d(reference_filter["gene"], (input_filter["gene"]))].iloc[:, 0] * input_filter.loc[np.intersect1d(input_filter["gene"], reference_filter["gene"]), "fc"]).sum() + 1 - b = input_filter.loc[np.setdiff1d(input_filter["gene"], reference_filter["gene"]), "fc"].sum() + 1 - c = reference_filter.loc[np.setdiff1d(reference_filter["gene"], input_filter["gene"])].iloc[:, 0].sum() + 1 - d = len(set(reference_full["gene"])) - len(set(reference_filter["gene"]).union(set(input_filter["gene"]))) + a = ( + reference_filter.loc[ + np.intersect1d(reference_filter["gene"], (input_filter["gene"])) + ].iloc[:, 0] + * input_filter.loc[ + np.intersect1d(input_filter["gene"], reference_filter["gene"]), "fc" + ] + ).sum() + 1 + b = ( + input_filter.loc[ + np.setdiff1d(input_filter["gene"], reference_filter["gene"]), "fc" + ].sum() + + 1 + ) + c = ( + reference_filter.loc[ + np.setdiff1d(reference_filter["gene"], input_filter["gene"]) + ] + .iloc[:, 0] + .sum() + + 1 + ) + d = len(set(reference_full["gene"])) - len( + set(reference_filter["gene"]).union(set(input_filter["gene"])) + ) else: a = len(np.intersect1d(reference_filter["gene"], input_filter["gene"])) + 1 b = len(np.setdiff1d(input_filter["gene"], reference_filter["gene"])) + 1 c = len(np.setdiff1d(reference_filter["gene"], input_filter["gene"])) + 1 d = len(set(reference_full["gene"])) - a - b - c - + # get both value from the fisher exact 
test # pval = pvalue(a, b, c, d).two_tail _, pval = stats.fisher_exact(np.array([[a, b], [c, d]])) - odds = (a * d)/(b * c) + odds = (a * d) / (b * c) # only get the significant p_value # we can make changes to the p_value as in like we allow the user to specify it so that they have more control on the result they can obtain if pval < 0.01: - return pd.Series({"pval": pval, "or": odds, "name": unique_name, - "gene": ",".join(reference_filter.loc[reference_filter["gene"].isin(input_filter["gene"]), "gene"]), - "background": len(set(reference_full["gene"])), "overlap": len(reference_filter.loc[reference_filter["gene"].isin(input_filter["gene"])]), "geneset": len(reference_filter)}) + return pd.Series( + { + "pval": pval, + "or": odds, + "name": unique_name, + "gene": ",".join( + reference_filter.loc[ + reference_filter["gene"].isin(input_filter["gene"]), "gene" + ] + ), + "background": len(set(reference_full["gene"])), + "overlap": len( + reference_filter.loc[ + reference_filter["gene"].isin(input_filter["gene"]) + ] + ), + "geneset": len(reference_filter), + } + ) else: return None # Split them into cores unique_name_dfs = [reference.iloc[name_dict[key], :] for key in name_dict.keys()] atlas_dfs = [reference.iloc[atlas_dict[key], :] for key in atlas_dict.keys()] - + # return unique_name_dfs, atlas_dfs, input, input_shape # return unique_name_dfs, atlas_dfs # using joblib to apply multiprocessing - results = Parallel(n_jobs=ncores, verbose = 2, batch_size = 64)(delayed(process_unique_name)(unique_name_df, input, atlas_dfs, input_shape, atlas_dict) for unique_name_df in unique_name_dfs) + results = Parallel(n_jobs=ncores, verbose=2, batch_size=64)( + delayed(process_unique_name)( + unique_name_df, input, atlas_dfs, input_shape, atlas_dict + ) + for unique_name_df in unique_name_dfs + ) results = [res for res in results if res is not None] # concatenate the result into the dataframe to the user @@ -363,8 +515,9 @@ def process_unique_name(reference_filter, input, atlas_dfs, input_shape, atlas_d res_df = pd.concat(results, axis=1).transpose() res_df["pval"] = res_df["pval"].astype(float).round(3) res_df["or"] = res_df["or"].astype(float).round(3) - res_df = res_df.sort_values(by=["pval", "or"], ascending=[True, False]).head(min(50, len(results))) + res_df = res_df.sort_values(by=["pval", "or"], ascending=[True, False]).head( + min(50, len(results)) + ) return res_df else: return None - \ No newline at end of file diff --git a/discotoolkit/DiscoClass.py b/discotoolkit/DiscoClass.py index a055cb2..ab3d4fe 100644 --- a/discotoolkit/DiscoClass.py +++ b/discotoolkit/DiscoClass.py @@ -6,10 +6,10 @@ class Filter: """ Filter class object to save the attributes for filtering the dataset from DISCO - sample String e.g. GSM3891625_3; - project String; - tissue String e.g. Lung, Bladder; - disease String e.g. PDAC; + sample_id String e.g. ERX2757110; + project_id String; + tissue String e.g. lung, bladder; + disease String e.g. COVID-19; platform String e.g. 
10x3'; sample_type String; cell_type String; @@ -20,12 +20,12 @@ class Filter: return Class object """ - def __init__(self, sample = None, project = None, tissue = None, disease = None, platform = None, sample_type = None, + def __init__(self, sample_id = None, project_id = None, tissue = None, disease = None, platform = None, sample_type = None, cell_type = None, cell_type_confidence : str = "medium", include_cell_type_children : bool = True, min_cell_per_sample : int = 100): # handling for string and list input - self.sample = self.convert_to_list(sample) # sample id - self.project = self.convert_to_list(project) # project, lab, or dataset from different author + self.sample_id = self.convert_to_list(sample_id) # sample id + self.project_id = self.convert_to_list(project_id) # project, lab, or dataset from different author self.tissue = self.convert_to_list(tissue) # organ tissue self.disease = self.convert_to_list(disease) # cancer or non cancer, or COVID-1e9 disease self.platform = self.convert_to_list(platform) # sequencing platform diff --git a/discotoolkit/DownloadDiscoData.py b/discotoolkit/DownloadDiscoData.py index 273986a..484c7d4 100644 --- a/discotoolkit/DownloadDiscoData.py +++ b/discotoolkit/DownloadDiscoData.py @@ -1,11 +1,11 @@ -''' -Descripttion: -version: +""" +Descripttion: +version: Author: Mengwei Li Date: 2023-04-14 16:33:44 LastEditors: Mengwei Li -LastEditTime: 2023-04-16 12:14:48 -''' +LastEditTime: 2024-10-23 21:30:03 +""" # import libraries import requests @@ -14,14 +14,15 @@ import re import os import hashlib +import scanpy as sc # import variable and class from other script from .GlobalVariable import logging, prefix_disco_url from .DiscoClass import FilterData, Filter from .GetMetadata import check_in_list -def download_disco_data(metadata, output_dir : str = "DISCOtmp"): +def download_disco_data(metadata, output_dir: str = "DISCOtmp"): """function to download the data based on the given filter Args: metadata (FilterData) : FilterData class to filters data from DISCO database @@ -29,116 +30,209 @@ def download_disco_data(metadata, output_dir : str = "DISCOtmp"): Returns: None: This function does not return any object and instead download the data for the user - """ + """ # define a list to store the error sample that can not be download error_sample = [] # create directory known as DISCOtmp or based on user defined directory if it does not exist - if not(os.path.exists(output_dir)): + if not (os.path.exists(output_dir)): os.mkdir(output_dir) + samples = list(metadata.sample_metadata["sample_id"]) # download data when the user does not want to subset to specific cell type - if metadata.filter.cell_type is None: - - # getting the dataframe of the samples - samples = metadata.sample_metadata - - # iterate through all the samples - for i in range(len(samples)): - s = list(samples["sampleId"])[i] - p = list(samples["projectId"])[i] - output_file = "%s/%s.h5ad" % (output_dir, s) # getting the name of the file - - # condition if the file has been downloaded before - if (os.path.exists(output_file)) and \ - hashlib.md5(open(output_file, "rb").read()).hexdigest() == list(samples["md5h5ad"])[i]: - logging.info(" %s has been downloaded before. Ignore ..." 
% (s)) # giving message to the user - else: - logging.info("Downloading data of %s" % (s)) # giving message to the user - - try: - # try to download the data as requested - response = requests.get(url=prefix_disco_url + "getH5adBySample?project="+ p +"&sample=" + s) - - # condition for error in downloading the samples - if response.status_code == 404: - error_sample.append(s) # append the error samples into a list for debugging later on - logging.warning("sample %s download fail" % (s)) # giving message to the user - else: - # write the file to directory - open(output_file, "wb").write(response.content) - - # condition for another error - if hashlib.md5(open(output_file, "rb").read()).hexdigest() != list(samples["md5h5ad"])[i]: - error_sample.append(s) - os.remove(output_file) - logging.warning("sample %s download fail" % (s)) # give message to the user - except: - # except error when the download fail - error_sample.append(s) - logging.warning("sample %s download fail" % (s)) - - # if there are error samples, we set all the attribute for FilterData object to the error samples and return the object in FilterData object - if len(error_sample) > 0: - metadata.sample_metadata = metadata.sample_metadata.set_index("sampleId") # change from inplace to this to increase efficiency - metadata.sample_metadata = metadata.sample_metadata.loc[error_sample] - metadata.cell_type_metadata = metadata.cell_type_metadata.iloc[check_in_list(metadata.cell_type_metadata["sampleId"], error_sample)] - metadata.cell_count = metadata.cell_type_metadata["cellNumber"].sum() - metadata.sample_count = len(error_sample) - return metadata - - else: - # get the dataframe for the metadata - metadata.sample_metadata = metadata.sample_metadata.set_index("sampleId") - samples = metadata.cell_type_metadata - concat_func = lambda x,y: x + "_" + str(y) - - # create new element by concatenating the sampleId and cluster - samples["sampleId"] = list(map(concat_func, list(samples["sampleId"]), - list(samples["cluster"]))) - - # similarly, iterative through all the sample for downloading - for i in range(len(samples)): - s = list(samples["sampleId"])[i] - sub_s = s.replace("_" + str(list(samples["cluster"])[i]) , "") # adding handler for celltype specific to get the data - p = metadata.sample_metadata.loc[sub_s]["projectId"] - output_file = "%s/%s.h5ad" % (output_dir, s) + + samples = metadata.sample_metadata + + for i in range(len(samples)): + s = list(samples["sample_id"])[i] + p = list(samples["project_id"])[i] + + output_file = "%s/%s.h5ad" % (output_dir, s) + if os.path.exists(output_file): + os.remove(output_file) + + url = prefix_disco_url + "download/getRawH5/" + p + "/" + s + rna = sc.read_10x_h5(filename=output_file, backup_url=url) + + cell = pd.read_csv( + prefix_disco_url + "toolkit/" + "getCellTypeSample?sampleId=" + s, + sep="\t", + header=0, + ) + # cell.set_index("cell_id") + cell = cell.iloc[:, [0, 2, 5]] + if metadata.filter.cell_type is not None: + if metadata.filter.cell_type_confidence == "high": + cell = cell.loc[cell["cell_type_score"] >= 0.8] + + elif metadata.filter.cell_type_confidence == "medium": + cell = cell.loc[cell["cell_type_score"] >= 0.6] + cell = cell.loc[cell.cell_type.isin(metadata.filter.cell_type)] - # checking for file and ignore if it is already exist - if (os.path.exists(output_file)) and \ - hashlib.md5(open(output_file, "rb").read()).hexdigest() == list(samples["h5adMd5"]): - logging.info(" %s has been downloaded before. Ignore ..." 
% (s)) # give message to the user - else: - logging.info("Downloading data of %s" % (s)) # give message to the user - try: - # get the data from the server - response = requests.get(url=prefix_disco_url + "getH5adBySampleCt?project="+ p +"&sample=" + s) - if response.status_code == 404: - error_sample.append(s) # append error samples - logging.warning("sample %s download fail 1" % (s)) # give message to the user - else: - open(output_file, "wb").write(response.content) - - # condition for another error - if hashlib.md5(open(output_file, "rb").read()).hexdigest() != list(samples["h5adMd5"])[i]: - error_sample.append(s) - os.remove(output_file) - logging.warning("sample %s download fail" % (s)) # give message to the user - except: - error_sample.append(s) - logging.warning("sample %s download fail" % (s)) # give message to the user - - # similarly, record the error samples and return to the variable - if len(error_sample) > 0: - metadata.cell_type_metadata = metadata.cell_type_metadata.iloc[check_in_list(metadata.cell_type_metadata["sampleId"], error_sample)] - metadata.sample_metadata = metadata.sample_metadata.iloc[check_in_list(list(metadata.sample_metadata["sampleId"]), metadata.cell_type_metadata["sampleId"])] - metadata.cell_count = metadata.cell_type_metadata["cellNumber"].sum() - metadata.sample_count = len(error_sample) - sample_cell_count = pd.DataFrame(metadata.cell_type_metadata.groupby(["sampleId"])["cellNumber"].agg("sum")) - sample_cell_count.columns = ["x"] - metadata.sample_metadata["cell_number"] = list(sample_cell_count.loc[list(metadata.sample_metadata["sampleId"])]["x"]) - return metadata - - # else return None as the data has been downloaded - return None \ No newline at end of file + rna = rna[cell.cell_id].copy() + rna.obs["cell_type"] = list(cell.cell_type) + rna.obs["sample_id"] = s + rna.obs["project_id"] = p + rna.write(output_file) + + # if metadata.filter.cell_type is None: + # # getting the dataframe of the samples + # samples = metadata.sample_metadata + + # # iterate through all the samples + # for i in range(len(samples)): + # s = list(samples["sampleId"])[i] + # p = list(samples["projectId"])[i] + # output_file = "%s/%s.h5ad" % (output_dir, s) # getting the name of the file + + # # condition if the file has been downloaded before + # if (os.path.exists(output_file)) and hashlib.md5( + # open(output_file, "rb").read() + # ).hexdigest() == list(samples["md5h5ad"])[i]: + # logging.info( + # " %s has been downloaded before. Ignore ..." 
% (s) + # ) # giving message to the user + # else: + # logging.info( + # "Downloading data of %s" % (s) + # ) # giving message to the user + + # try: + # # try to download the data as requested + # response = requests.get( + # url=prefix_disco_url + # + "getH5adBySample?project=" + # + p + # + "&sample=" + # + s + # ) + + # # condition for error in downloading the samples + # if response.status_code == 404: + # error_sample.append( + # s + # ) # append the error samples into a list for debugging later on + # logging.warning( + # "sample %s download fail" % (s) + # ) # giving message to the user + # else: + # # write the file to directory + # open(output_file, "wb").write(response.content) + + # # condition for another error + # if ( + # hashlib.md5(open(output_file, "rb").read()).hexdigest() + # != list(samples["md5h5ad"])[i] + # ): + # error_sample.append(s) + # os.remove(output_file) + # logging.warning( + # "sample %s download fail" % (s) + # ) # give message to the user + # except: + # # except error when the download fail + # error_sample.append(s) + # logging.warning("sample %s download fail" % (s)) + + # # if there are error samples, we set all the attribute for FilterData object to the error samples and return the object in FilterData object + # if len(error_sample) > 0: + # metadata.sample_metadata = metadata.sample_metadata.set_index( + # "sampleId" + # ) # change from inplace to this to increase efficiency + # metadata.sample_metadata = metadata.sample_metadata.loc[error_sample] + # metadata.cell_type_metadata = metadata.cell_type_metadata.iloc[ + # check_in_list(metadata.cell_type_metadata["sampleId"], error_sample) + # ] + # metadata.cell_count = metadata.cell_type_metadata["cellNumber"].sum() + # metadata.sample_count = len(error_sample) + # return metadata + + # else: + # # get the dataframe for the metadata + # metadata.sample_metadata = metadata.sample_metadata.set_index("sampleId") + # samples = metadata.cell_type_metadata + # concat_func = lambda x, y: x + "_" + str(y) + + # # create new element by concatenating the sampleId and cluster + # samples["sampleId"] = list( + # map(concat_func, list(samples["sampleId"]), list(samples["cluster"])) + # ) + + # # similarly, iterative through all the sample for downloading + # for i in range(len(samples)): + # s = list(samples["sampleId"])[i] + # sub_s = s.replace( + # "_" + str(list(samples["cluster"])[i]), "" + # ) # adding handler for celltype specific to get the data + # p = metadata.sample_metadata.loc[sub_s]["projectId"] + # output_file = "%s/%s.h5ad" % (output_dir, s) + + # # checking for file and ignore if it is already exist + # if (os.path.exists(output_file)) and hashlib.md5( + # open(output_file, "rb").read() + # ).hexdigest() == list(samples["h5adMd5"]): + # logging.info( + # " %s has been downloaded before. Ignore ..." 
% (s) + # ) # give message to the user + # else: + # logging.info("Downloading data of %s" % (s)) # give message to the user + # try: + # # get the data from the server + # response = requests.get( + # url=prefix_disco_url + # + "getH5adBySampleCt?project=" + # + p + # + "&sample=" + # + s + # ) + # if response.status_code == 404: + # error_sample.append(s) # append error samples + # logging.warning( + # "sample %s download fail 1" % (s) + # ) # give message to the user + # else: + # open(output_file, "wb").write(response.content) + + # # condition for another error + # if ( + # hashlib.md5(open(output_file, "rb").read()).hexdigest() + # != list(samples["h5adMd5"])[i] + # ): + # error_sample.append(s) + # os.remove(output_file) + # logging.warning( + # "sample %s download fail" % (s) + # ) # give message to the user + # except: + # error_sample.append(s) + # logging.warning( + # "sample %s download fail" % (s) + # ) # give message to the user + + # # similarly, record the error samples and return to the variable + # if len(error_sample) > 0: + # metadata.cell_type_metadata = metadata.cell_type_metadata.iloc[ + # check_in_list(metadata.cell_type_metadata["sampleId"], error_sample) + # ] + # metadata.sample_metadata = metadata.sample_metadata.iloc[ + # check_in_list( + # list(metadata.sample_metadata["sampleId"]), + # metadata.cell_type_metadata["sampleId"], + # ) + # ] + # metadata.cell_count = metadata.cell_type_metadata["cellNumber"].sum() + # metadata.sample_count = len(error_sample) + # sample_cell_count = pd.DataFrame( + # metadata.cell_type_metadata.groupby(["sampleId"])["cellNumber"].agg( + # "sum" + # ) + # ) + # sample_cell_count.columns = ["x"] + # metadata.sample_metadata["cell_number"] = list( + # sample_cell_count.loc[list(metadata.sample_metadata["sampleId"])]["x"] + # ) + # return metadata + + # # else return None as the data has been downloaded + # return None diff --git a/discotoolkit/GetMetadata.py b/discotoolkit/GetMetadata.py index d641320..82f681a 100644 --- a/discotoolkit/GetMetadata.py +++ b/discotoolkit/GetMetadata.py @@ -14,7 +14,8 @@ from .GlobalVariable import logging, prefix_disco_url from .DiscoClass import FilterData, Filter -def get_json(url : str, info_msg: str, error_msg: str, prefix : bool = True): + +def get_json(url: str, info_msg: str, error_msg: str, prefix: bool = True): """Funtion to get json data from the server Args: @@ -26,36 +27,37 @@ def get_json(url : str, info_msg: str, error_msg: str, prefix : bool = True): Returns: JSON: return JSON data which will be converted into pandas dataframe later on """ - + # giving log message to the user logging.info(info_msg) # condition for adding prefix - if prefix == False: + if not prefix: response = requests.get(url) else: - response = requests.get(prefix_disco_url + "/" + url) - + response = requests.get(prefix_disco_url + url) + # if the loading of JSON is successful, we get the text if response.status_code == 200: data = json.loads(response.text) - return data # return json data Dictionary datatype in python + return data # return json data Dictionary datatype in python else: - # print informative message to the user logging.error(error_msg) + def get_sample_ct_info(): - """ Get cell type information of sample + """Get cell type information of sample Returns: Pandas DataFrame: return pandas dataframe to the user - """ - temp = get_json(url = "/getSampleCtInfo", info_msg = "Retrieving cell type information of each sample from DISCO database", - error_msg = "Failed to retrieve cell type information. 
Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.") - return pd.DataFrame(temp) # return Dataframe of the JSON data + """ + temp = pd.read_csv( + prefix_disco_url + "toolkit/" + "getCellTypeSummary", sep="\t", header=0 + ) + return temp # return Dataframe of the JSON data -def find_celltype(term : str = "", cell_ontology : dict = None): +def find_celltype(term: str = "", cell_ontology: dict = None): """find the celltype within the disco dataset Args: @@ -68,9 +70,12 @@ def find_celltype(term : str = "", cell_ontology : dict = None): # when the user do not have cell ontology, we get the default one from disco database if cell_ontology is None: - cell_ontology = get_json(url = "/getCellOntology", info_msg = "Retrieving ontology from DISCO database", - error_msg = "Failed to retrieve ontology Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.") - cell_type = pd.DataFrame(cell_ontology)["cell_name"] # convert to dataframe + cell_ontology = get_json( + url="toolkit/getCellOntology", + info_msg="Retrieving ontology from DISCO database", + error_msg="Failed to retrieve ontology Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.", + ) + cell_type = pd.DataFrame(cell_ontology)["cell_name"] # convert to dataframe # convert term to lower term for matching. the same apply for the cell_type term = term.lower() @@ -87,22 +92,30 @@ def find_celltype(term : str = "", cell_ontology : dict = None): # get the index and return the cell type in the correct case in the form of Python List return list(cell_type[idx_list]) -def get_disco_metadata(): +def get_disco_metadata(): """get metadata of the disco into the dataframe Returns: Pandas Dataframe: Disco metadata in the format of dataframe which will be used for the filter class - """ + """ - metadata = get_json(url = "http://www.immunesinglecell.org/api/vishuo/sample/all", info_msg = "Retrieving metadata from DISCO database", - error_msg = "Failed to retrieve metadata. Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.", prefix=False) - metadata = pd.DataFrame(metadata) - metadata = metadata[metadata["processStatus"] == "QC pass"] # filtering to only get the sample that pass quality control - metadata.set_index("sampleId") # setting the index of the metadata to sample_id for consistency + metadata = pd.read_csv( + "https://immunesinglecell.org/disco_v3_api/toolkit/getSampleMetadata", + sep="\t", + header=0, + ) + # metadata = pd.DataFrame(metadata) + # metadata = metadata[ + # metadata["processStatus"] == "QC pass" + # ] # filtering to only get the sample that pass quality control + metadata.set_index( + "sample_id" + ) # setting the index of the metadata to sample_id for consistency return metadata -def get_celltype_children(cell_type: Union[str , list], cell_ontology: dict = None): + +def get_celltype_children(cell_type: Union[str, list], cell_ontology: dict = None): """get the children of the input celltype from the user Args: @@ -113,8 +126,13 @@ def get_celltype_children(cell_type: Union[str , list], cell_ontology: dict = No List of String: return the children of the defined celltype in list of String """ if cell_ontology is None: - cell_ontology = pd.DataFrame(get_json(url="/getCellOntology", info_msg = "Retrieving ontology from DISCO database", - error_msg = "Failed to retrieve ontology Please try again. 
If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.")) + cell_ontology = pd.DataFrame( + get_json( + url="toolkit/getCellOntology", + info_msg="Retrieving ontology from DISCO database", + error_msg="Failed to retrieve ontology Please try again. If the issue persists, please contact us at li_mengwei@immunol.a-star.edu.sg for assistance.", + ) + ) # define an empty list children = [] @@ -129,13 +147,20 @@ def get_celltype_children(cell_type: Union[str , list], cell_ontology: dict = No # while loop to get all the children celltypes iteratively while len(cell_type) > 0: - idx_list = [index for index, each in enumerate(list(cell_ontology["parent"])) if each in cell_type] + idx_list = [ + index + for index, each in enumerate(list(cell_ontology["parent"])) + if each in cell_type + ] children.extend(list(cell_ontology["cell_name"][idx_list])) cell_type = list(cell_ontology["cell_name"][idx_list]) - children = list(set(children)) # ensure that the obtained children of celltype are unique + children = list( + set(children) + ) # ensure that the obtained children of celltype are unique return children + def list_metadata_item(field: str): """List element inside the metadata columns @@ -153,24 +178,25 @@ def list_metadata_item(field: str): if field in metadata.columns: return list(set(metadata[field])) else: - logging.warning("DISCO data don't contain '%s' field. Please check the field name" % (field)) + logging.warning( + "DISCO data don't contain '%s' field. Please check the field name" % (field) + ) return None - -def list_all_columns(): + +def list_all_columns(): """list all the columns found in the metadata of the disco database Returns: List: return the name of the metadata in the form of list of string - """ - + """ + # get the disco metadata metadata = get_disco_metadata() return list(metadata.columns) -def filter_disco_metadata(filter : Filter = Filter()): - +def filter_disco_metadata(filter: Filter = Filter()): """filter function option for the disco data Args: Filter (Class): predefined Filter class with default attribute to filter data for the user @@ -182,15 +208,17 @@ def filter_disco_metadata(filter : Filter = Filter()): # starting with defining variable filter_data = FilterData() - metadata = get_disco_metadata() # get metadata - logging.info("Filtering sample") # producing log message to the user + metadata = get_disco_metadata() # get metadata + logging.info("Filtering sample") # producing log message to the user # condition for each provided attribute by the user or default - if filter.sample is not None: - metadata = metadata.iloc[check_in_list(metadata["sampleId"], filter.sample)] + if filter.sample_id is not None: + metadata = metadata.iloc[check_in_list(metadata["sample_id"], filter.sample_id)] - if filter.project is not None: - metadata = metadata.iloc[check_in_list(metadata["projectId"], filter.project)] + if filter.project_id is not None: + metadata = metadata.iloc[ + check_in_list(metadata["project_id"], filter.project_id) + ] if filter.tissue is not None: metadata = metadata.iloc[check_in_list(metadata["tissue"], filter.tissue)] @@ -202,81 +230,130 @@ def filter_disco_metadata(filter : Filter = Filter()): metadata = metadata.iloc[check_in_list(metadata["disease"], filter.disease)] if filter.sample_type is not None: - metadata = metadata.iloc[check_in_list(metadata["sampleType"], filter.sample_type)] + metadata = metadata.iloc[ + check_in_list(metadata["sample_type"], filter.sample_type) + ] # condition when there is no samples with 
the provided filters if len(metadata) == 0: logging.warn("Sorry, no samples passed the applied filters.") return None - + # get the celltype information sample_ct_info = get_sample_ct_info() # remove the unnecessary fields from the metadata and only keep the below defined columns - retain_field = ["sampleId", "projectId", "sampleType", "anatomicalSite", "disease", - "tissue", "platform", "ageGroup", "age", "gender", "cellSorting", - "diseaseSubtype", "diseaseStage", "treatment", "md5h5ad"] - + # retain_field = [ + # "sample_id", + # "projectId", + # "sample_type", + # "anatomicalSite", + # "disease", + # "tissue", + # "platform", + # "ageGroup", + # "age", + # "gender", + # "cellSorting", + # "diseaseSubtype", + # "diseaseStage", + # "treatment", + # "md5h5ad", + # ] + # subset to the retained field - metadata = metadata[retain_field] + # metadata = metadata[retain_field] # checking for the specific cell type if filter.cell_type is None: - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["sampleId"], metadata["sampleId"])] - sample_cell_count = pd.DataFrame(sample_ct_info.groupby(["sampleId"])["cellNumber"].agg("sum")) # get the total number of cells - sample_cell_count.columns = ["x"] # assigning different column name - filtered_index = np.intersect1d(sample_cell_count.index, list(metadata["sampleId"])) # getting the common sampleId from the ct info and metadata info - metadata = metadata.loc[metadata['sampleId'].isin(filtered_index)] # filtering to only the common sampleId - metadata["cell_number"] = list(sample_cell_count.loc[filtered_index]["x"]) # assigning to the metadata - metadata = metadata[metadata["cell_number"] > filter.min_cell_per_sample] # filter by the minimum cell per sample + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["sample_id"], metadata["sample_id"]) + ] + sample_cell_count = pd.DataFrame( + sample_ct_info.groupby(["sample_id"])["cell_number"].agg("sum") + ) # get the total number of cells + sample_cell_count.columns = ["x"] # assigning different column name + filtered_index = np.intersect1d( + sample_cell_count.index, list(metadata["sample_id"]) + ) # getting the common sample_id from the ct info and metadata info + metadata = metadata.loc[ + metadata["sample_id"].isin(filtered_index) + ] # filtering to only the common sample_id + metadata["cell_number"] = list( + sample_cell_count.loc[filtered_index]["x"] + ) # assigning to the metadata + metadata = metadata[ + metadata["cell_number"] > filter.min_cell_per_sample + ] # filter by the minimum cell per sample # condiition for none if len(metadata) == 0: logging.warn("Sorry, no samples passed the applied filters.") return None - # subsetting based on the sampleId - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["sampleId"], metadata["sampleId"])] - + # subsetting based on the sample_id + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["sample_id"], metadata["sample_id"]) + ] + # now setting the attribute of the FilterData filter_data.sample_metadata = metadata filter_data.cell_type_metadata = sample_ct_info filter_data.sample_count = len(metadata) filter_data.cell_count = sum(metadata["cell_number"]) - filter_data.filter = filter # now set all the changes we made to the Filter object + filter_data.filter = ( + filter # now set all the changes we made to the Filter object + ) # giving the message to the user - logging.info("%s samples and %s cells were found" % (filter_data.sample_count, filter_data.cell_count)) + logging.info( + "%s 
samples and %s cells were found" + % (filter_data.sample_count, filter_data.cell_count) + ) return filter_data - + # condition to get all the data for the sub celltype if filter.include_cell_type_childen: filter.cell_type = get_celltype_children(filter.cell_type) # subset to the given cell type and sample - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["cellType"], filter.cell_type)] - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["sampleId"], list(metadata["sampleId"]))] + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["cell_type"], filter.cell_type) + ] + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["sample_id"], list(metadata["sample_id"])) + ] # condition for different values of the cell_type_confidence if filter.cell_type_confidence not in ["high", "medium", "all"]: - logging.warning("cell_type_confidence can only be high, medium, or all") # give the user an informative message + logging.warning( + "cell_type_confidence can only be high, medium, or all" + ) # give the user an informative message if filter.cell_type_confidence == "high": - sample_ct_info = sample_ct_info[sample_ct_info["cellTypeScore"] >= 0.8] + sample_ct_info = sample_ct_info[sample_ct_info["cell_type_score"] >= 0.8] elif filter.cell_type_confidence == "medium": - sample_ct_info = sample_ct_info[sample_ct_info["cellTypeScore"] >= 0.6] + sample_ct_info = sample_ct_info[sample_ct_info["cell_type_score"] >= 0.6] # again condition for no sample found and return None if len(sample_ct_info) == 0: logging.warn("Sorry, no samples passed the applied filters.") return None - + # follow from previous block of code - metadata = metadata.iloc[check_in_list(list(metadata["sampleId"]), sample_ct_info["sampleId"])] - sample_cell_count = pd.DataFrame(sample_ct_info.groupby(["sampleId"])["cellNumber"].agg("sum")) # total cell numbers + metadata = metadata.iloc[ + check_in_list(list(metadata["sample_id"]), sample_ct_info["sample_id"]) + ] + sample_cell_count = pd.DataFrame( + sample_ct_info.groupby(["sample_id"])["cell_number"].agg("sum") + ) # total cell numbers sample_cell_count.columns = ["x"] - - metadata["cell_number"] = list(sample_cell_count.loc[list(metadata["sampleId"])]["x"]) # count number of cell - metadata = metadata[metadata["cell_number"] > filter.min_cell_per_sample] # filter based on min cell per sample + + metadata["cell_number"] = list( + sample_cell_count.loc[list(metadata["sample_id"])]["x"] + ) # count number of cell + metadata = metadata[ + metadata["cell_number"] > filter.min_cell_per_sample + ] # filter based on min cell per sample # condition for None if len(metadata) == 0: @@ -284,7 +361,9 @@ def filter_disco_metadata(filter : Filter = Filter()): return None # subsetting before assign attribute for the FilterData object - sample_ct_info = sample_ct_info.iloc[check_in_list(sample_ct_info["sampleId"], metadata["sampleId"])] + sample_ct_info = sample_ct_info.iloc[ + check_in_list(sample_ct_info["sample_id"], metadata["sample_id"]) + ] # now assign all the values to the attribute of the object filter_data.sample_metadata = metadata @@ -292,9 +371,13 @@ def filter_disco_metadata(filter : Filter = Filter()): filter_data.sample_count = len(metadata) filter_data.cell_count = sum(metadata["cell_number"]) filter_data.filter = filter - logging.info("%s samples and %s cells were found" % (filter_data.sample_count, filter_data.cell_count)) + logging.info( + "%s samples and %s cells were found" + % (filter_data.sample_count, 
filter_data.cell_count) + ) return filter_data + # utils function for getting the index of the element on the left list for subsetting if the element is exist in the right list. def check_in_list(var, whitelist): - return [index for index, each in enumerate(list(var)) if each in list(whitelist)] \ No newline at end of file + return [index for index, each in enumerate(list(var)) if each in list(whitelist)] diff --git a/discotoolkit/GlobalVariable.py b/discotoolkit/GlobalVariable.py index b737a21..ec626ab 100644 --- a/discotoolkit/GlobalVariable.py +++ b/discotoolkit/GlobalVariable.py @@ -1,3 +1,11 @@ +''' +Descripttion: +version: +Author: Mengwei Li +Date: 2023-07-06 16:59:59 +LastEditors: Mengwei Li +LastEditTime: 2024-10-23 15:52:29 +''' """ Global variable file to import for the subsequent script. @@ -17,9 +25,9 @@ timeout = 600 # Define package-level variable -response = requests.get("http://www.immunesinglecell.org/api/vishuo/getToolkitUrl") +# response = requests.get("http://www.immunesinglecell.org/api/vishuo/getToolkitUrl") -if response.status_code == 200: - prefix_disco_url = json.loads(response.text)["url"] -else: - prefix_disco_url = "http://www.immunesinglecell.org/toolkitapi" \ No newline at end of file +# if response.status_code == 200: +# prefix_disco_url = json.loads(response.text)["url"] +# else: +prefix_disco_url = "https://immunesinglecell.org/disco_v3_api/" \ No newline at end of file diff --git a/discotoolkit/Utilities.py b/discotoolkit/Utilities.py new file mode 100644 index 0000000..1fd4b9c --- /dev/null +++ b/discotoolkit/Utilities.py @@ -0,0 +1,74 @@ +import h5py +import numpy as np +from scipy.sparse import csr_matrix +from pathlib import Path + + +def write_10X_h5(adata, file): + """Writes adata to a 10X-formatted h5 file. + + Note that this function is not fully tested and may not work for all cases. + It will not write the following keys to the h5 file compared to 10X: + '_all_tag_keys', 'pattern', 'read', 'sequence' + + Args: + adata (AnnData object): AnnData object to be written. + file (str): File name to be written to. If no extension is given, '.h5' is appended. + + Raises: + FileExistsError: If file already exists. 
+ + Returns: + None + """ + + if ".h5" not in file: + file = f"{file}.h5" + if Path(file).exists(): + raise FileExistsError(f"There already is a file `{file}`.") + + def int_max(x): + return int(max(np.floor(len(str(int(max(x)))) / 4), 1) * 4) + + def str_max(x): + return max([len(i) for i in x]) + + w = h5py.File(file, "w") + grp = w.create_group("matrix") + grp.create_dataset( + "barcodes", + data=np.array(adata.obs_names, dtype=f"|S{str_max(adata.obs_names)}"), + ) + grp.create_dataset( + "data", data=np.array(adata.X.data, dtype=f"
[remainder of Utilities.py and the trailing GIT binary patch hunks for the new binary files are omitted]
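A short usage sketch for the new write_10X_h5 helper, based only on the docstring above (the tail of the function body is not visible in this patch); the AnnData source and output name are placeholders:

import scanpy as sc
from discotoolkit.Utilities import write_10X_h5

# placeholder input; any AnnData whose .X is a sparse CSR matrix should be suitable
adata = sc.read_h5ad("DISCOtmp/ERX2757110.h5ad")

# ".h5" is appended when missing; writing over an existing file raises FileExistsError
write_10X_h5(adata, "DISCOtmp/ERX2757110_10x")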
z*1V7?84RPa(54C|2P7z5)+}27wMLfGHr6XZ%WrII`)Gw+ON>GKTv<<{OEjCP`JT$!VuPWeG!TsZn3q~n z2lU5L+_IMQAt@Y+seNW5jS4&1=$~HJUD>jNFpCkF$XgDjH}Ejhv{F|r>Dm8`=&?g^ z#<*{6Tt(H?$p>O;+XvzMVcFlP_!}4e&GY_d?Cga9+%$LbPaVH-$Sp^dmLs2arTxva z|FYu0oGg3dsZQIfS>KR>mH59p0F7iKjX~$=CT&gnEtJE?+479@U*YviJ&0{;8q!NH zkrxf68`Lm(u{l%H2WE|0&+8T(#YJ05FQy8M18u>gMV~WfoV7D+90*5_feBL|!mu0b z2}MLOk1r?-)*4wOHYl4c5*cz}PQD6lE^C6ac%AjP)I-6RV2oM{GZ*`eKqY{GhkL*u zsqT5(u$OGZ@#tW8JJ1O1^~(LdBi9i4tjUVTVe zedtkee)VzLcS7-<_{^*LUP)R%_jo6Ur>kU7z2d1~@HEbQ8fPkGPpjf-P1{=4DOqSj z=el zII6=GBuZ9guwfMJnvl5mBE8zJw{g36$0K1XZpG>CD@B^0g)&itThP}LmRF5cI7O|y zY0!kaSM6tdZWaC`+Ri)D#h?@`97`BpSN%CSQDn@H!hP_*m#V$ zN^HM(Z1G~XxS~yZd`Udf)W?t8x=GZqYuuJ{=l$9{#S_h?TdsF(u&|URZM{a(3L8rS zIE4{rTdAi?Z40B8|IGf#zL?dh+4fys}QTHzeNeLeQ=<= z+`Te&^|9caDx0o<&pG8>5bEcJ`j6Iq*feugUiYH1?!~#?X`x;gjw!;i&-gEmhAM1x z!5QOJaS1jr9c$vH>&Q;mJ%#1OPw?ihF}IvJQ&>*$<0eYOaX)qZCQFNFPNQK<<7UxV zoG}Rv3~k)lXTnnCEOnIB4x*UAV;jKI_vvv59Ed(zlt{FVt1d}~$@eOptTm_2#HiZP zWyZ#|OO*Y;S#Mz*mhBt!&<0|?$XH#zcxd^Oa|+8B|K#wzCy@38uugIN-tD^6HEohz zYZTX->7e2Yq}Kn@=0Ds#b3yIX**0Ze`|QoRp`X3|D5UH>^;t0OJo|(tM-HrGvGrR< zMg~>!p|uzG!D$hlbug8jM0e;dsC>uqs!6~eH1@kyT@TQLdp&qggvF6Mgks4c`q|d9`V_Cs|m373TQoNh3v}4(_5X>PAWz^)5So^CC2&l<2v8loq@Px>yE#?}R+@mN!^^eschOgSN@CDa?Mc%Kj*Op=PUEhS7hgD#d-R( zUd4G4Q_^L}lyupXJhrl3!h*AQ-dX$Le5(Aznh)N%|HjNY+1aW%Ta$do?VaemD^9n) z*EiLtxYsVY8|K{&vU`K#-jFmt5z42m4@}eNKCs@mepH@1_oK=WE2lk*(6}Hp&kN16 z(5eWnN%Iq5#l5EYn-&cQmwOTAYMd{PrgG2%Cu#l0&Xre8ZJoX;`!*@QO-XBJbzmB| z2~WmXv*6n>@7s_XO7+cEq*Er4kLS6GW)T3&AM>5i$%(5-190*@XcYzgE;%&(r3rH0g_GkyU_UvL zQMQ)^C+yVwN9tD^(1j=tVl=b>`_qD`*#GX^7#p`0e8^fqqHwj=!jzO1=7EX4pk zQMoKJm#if>^{v6azTTlokFHY+y7~_hH78{Djsmux!t>Hg+ZAz*%m+A5r&;Rt&S%ko zAwKp&eDGPm8@$tW4QNg*sQ`ZbZBRkTZz(*8o@5f_dkE4liJsYzwU#=FvH*g}e^LF- zl~10(clrIx4|+aoNX0%FxIZ9QY*i|@E>yfYU-6<`v0bUyUWlv;ZGU0y-Cf}E-C4WF z?}|l-q@Iw9?dis{jxbHQGZD}&y#o@OHcK=kvi5K!Or|87H})`gR{(Mc5KPr*#t}o+ zbF74BhxJs2ESW=uW0h!MRN6z%UUEn$k@k~wkem*3?Bo#BmJX3~n4BZz93$tSlS89K zqW)I}%|9a_328_b0H3TkgHi+wS?M-~pCBhm4z1R*W~^UfZzsTE)m?$&pP6r(^hqao|WA{v;iZYnCKiaVN~>da^Jy&R8k(nSI-cO5)< zP(@+6^)p@~1I6q3kM*MRB&+Cx;bc2!78KK}RBzhUESs7YQ!_jyOsXbh4-VZwB|DoG zXA>}4=2!SnyXK64{^~Ef`qgZ7(afm(#YFfz+~j)7ILI!$0}G zJNuIR9+O1N2(Od0A_X!pH7s1ss^m#TFZ03zGu`(=^#5aAn3HpsCl6@RYZ19&uk7Ea z`1cWdtuv2a+a(LT6#?k7I9Le@~MsAHvUYNJmq^&hNqXtnnkYQRCARz z_rCM~cQUrxnZ`NqBXkK@SqDU6r3D05TE;q24>b1csTR4SO{r)b>%{o3U9YU(GS;b> zsu%4h^9yLryyN8T)eB{7=F8Tk*S5-KFDPX%j2&9E)DR&nIXgIWRoYZJ`Mv4J)ZrPQ zY}%@rwx&&6pHv*XdpUJ!&hV%sU2#mVIHpt_(@Nf_`#<+qPR6E&Q{JhB>|GBWDxi&! 
z{}>!_F*;N~Rny3->7;6YXZmf7Jf)&#tn+i=H9Ir5%8Yw;Mp(NbY@8Q1rbhEaa{s)5 zp+~69M{TljB;#pGvtLH2T69=kCwL;@Ht!r}5HXId?3H9cm^t7RVG`@0I@pxq+f1of$4{(@* zzMMOJr1m#WInImTqNzn>=t+c?7sIG6C>AMxQUxSxArp8oI{gnTN?L&CNM$}&oOVWXSG+CnW3|vw zuWx%@JY6mI@kXH7>qCmv-{B%%K=u7UN41W76sjXu>g&mKVeXC-|8D*G7&Yt15G=@K zn9E&pf(qoWQhTgEe<0O<59c7w32T7_ZrqvD%))#=r6cPGFS!y{@Fu4|HI1T3eQ9Vh z_tctVjzwR>s;3LbuPi#6+|NamTc4U}niv`{34MN=Chmz5b_GtHD8m-*T0Lh-1@d%h?|RNOY6s?4gFH>uYvbnqaazpg zE3XC6{n48Qq@h^qX}iJjV5lDm${cb{kHcgZ z@@2t@7mL_SkwG+793&1IQ`Ue#0wPA}n>QPYgsp+zo~Acj0&mj8n<|o#0DFu)vI#Kq z-t6tsyu&8~h+sejfk^7pVIU!ApgOouIF`5bU)hvZzP>KQGg@KK61#DPQ>SLps1{NiXsW8!z zH5V}E)UjFwKT?CSL8NFy{QZUxW~ zR%xQ;VaY*j0{9ir`Dv-up4CfOwfeoDLR~zd4~n$t<2_A79);X5t_~#wZJa<~u&L{T zNnUyzN3;v8&Uj&cft6ZuRmf?}PS~_hdf8K?uy~44#RG{(E#?C>{kk%BrC_vZYmc=| z5pE}zX>+FUdn|u$QLllB23lo7w^@l2SAG=f>-+4KJO>r_75kt_g{}qkH2r3M@Er6x zv?nQbqw*=ES3gEq%t2Q#eVUXf-v`>eu6*4n(Wide6Z-;;GxBeBRQ&_TzP`Nt_*$`q zV$bU8@U-!@Yzg*fC_&f8Pa8YSmhg|z`-w7Y@r0eZ4z=~QAAcy?XLKBh_MX_4C`;J; z3<*cV*=Oi0v1RFLW*{2Zs2eGq#0wJkgoAza2m55B9-N`FUuQW=e-Yo5%N)1jIo*Ui z;lU|f$MQA>?L5`@RH|SLDdvjOb$zR+R5WY(mj0ctPc^gPY0_fl9R4w>E^_#%m8X9v zI`5aP@UbGHFCe)(r9VS+sp=FNsMIY&(vprz|AwAP$CCaOPT%+ae|-FJMs6MWmfzZB z#TOs2^NnvmNNNU5uL4S;=Q{04M#QW&hw;ZIC24i|UPD#ZbQM}I+oH??udj!OW9n{b z8Dglg9d#>|P&rw1XkaittZvZ)*^|TNw25p#6(4~Z8pu*QG2GjWz#p?DLlMI5kS>yS z_J(5s0(Xbr!WjYpc-Ri_QM^&HY!4`oi8h4+5fTc@SU1#fl#~O4t9#30Z!8Zj%6;!B z7miOKWF06HAGcuppse|_ZcS!G{r!X~!eS(sah4Be`72qzC(Db{eWKPfZ>pt2(WB;k=Cmy+R zm(sXPZrH6f>|SU%INxyaQH9)aT*0sAMAA*0OIsIKx6iL`&$!ByC#Ku6AyvDBK5;=i zX0>)~tb&y4HB-Znoxp1JqIqkoUzw}tW)yWQQQ+n4ZLV$9?w#!Vf zQs185dHgervh$4WKdbo9reD33_FrN|{7+xS`wj1HnA(u>m49h9di`i3Y(cv@d~-#u z!Krk8+w4ZAe%IVyWU(ig1xx(3g+m>J^f&FD6u(CL+F#l@WLf@R#Z(1l=`iQA{6^rs z{=0{#-RSq~7TMpb_*-S+1x0w_=i43&CjjbICNJMEKi$JVUD>rQqgJ8--KSs&80_3LP|eYqx+h4YGVK7HvmS$Hkk zfuay|`DT%YbBb^-ec_@kTr9*~w)RV7E?2Qx?B;Es#n^pm4!*sN(DG`q%`OvubUoc2lOd?hj7<{)tR=qf)&!Q@eHMl2W^Up?24N?XJ14k8UWt zJLTFlO6{5Sx!2OQuc7N|RsoQSaCGkX5F0SeAL0ho6k&t1 zM(})mI0nmRd>|5-$bXH%+zBWhEOobq(2;ZQmS4mMEIw#wDh=GW$C}eFU^zl53B6|(N-3-g#G4>lQmOoQ}&r{A9`lJ z^p_62sCc)jZ{f1aesPfKF-q?f6;tH=3pl!dhk-#w5E4@8uDO693@kwqmMqW{KueDs z#?3fH33ab{c`J)sf)VNV0CRS4!iZ+Du*KlgasSaALI4%TUYt z0dfxkBkP|wKuG5xT;bnSlLZmI02(wOB+wvpZJcZMqLnM#K(N6EY@{pLOLwlCKAPG! 
zbA4{N2B~4xq*Bu^ahHxM(m~inBE>W}jV$!#uRq3DMKM8O5mqM%oi1h2yYC|6Qs~{Y z%>=+axfo4JOa%8X^*0%tDW9oJ#%Aj1k5Py|_$@?YItPRLOONn)0hoTnm0<>y|Kddi zG;NY-wXVU*(XO^MLk_J8R9M?T1RVF<$R%qY8pO#p=|518x5+s{4(Wa>`i^!|nO-OT z6?{PYAv(?@V3={zLkcM?d=Nkyq-S$4J`EX{-l7P1$oWfhh;KCsD&m@^Ap2UC2L0wK z>T67|(<*Spsj?OcADx7@>Cq3_7=00oPT<85BNWpQVm5yNno47Xm^3>L5DqZ7JP^*i zWFSlb8+=$50ZDiRJ1?wdvey$wCMa=%}8Zc&_D2;jCehi_|_Z95d(j-&>!x89!SHAtm&{A3l%Y-ub=T&u6iO2r+=}kLj z|IWXv0*;KnS_TJN27R@RP+H!}>U3?3>?I(Xvt4nvFF5ziJ99uX202eVPd~0+b>9sj zS$Z`f;mvv^Sq&k{Y6wYI<4>p8Uqm)vSKT*o#iCW?gEdQr$FjOs;-$_B(Som3BhC zjh#&P%BG5pzb0dILlRe5{?<}P-~AllG_eWC9nALR#&=GmC2i%C!Fg*HJ}!rbo^DV1 z@9&W-Hzv(VaU804#XZmao|I+g;-p6wwkg6koYoa8rpi>M_&d`#l!}eAca!4X^l8IP zX!h8{0eSOY1?}kFD?9fo&V8f|+L4yy%EqxXnToZHLhKKX(BzFS4tFLGsO^fR2KCF58%y+d)V zo7w!+)}OS>oA)T2_oN;4%WT^<+n9WL@&`~(t6Fd>Anj~@;;hU)#x=ITXG;{wK11z1;JuYl*#$^Y2-?>prq7nCCx z@F-VaR4Ol`{&{JflgT50@q<76fqdYsa^NgNR8@8CBvve*gZ%Gao$j3p&K*j-56JEV ziu*uL8ZD71D~8*+x|YQ<5=mdOL|-!(eh(mOaK?^f5O{V&22A(Ck=?r#_bvot`g*7z z{EsmgYo>S3c+*}iwYDhUEeqcEd2hSy-KltYqL;u0Ece$g)U?dkw0t@w*R(4&?F%(~ z=46=(8s+AYZ6Vx}JE-lW|p)89F3sFEX}xq1&%nCJ zqLHjc{c47KddCb-KA5vA7^Dz`cxSCwgR!gq;VX2ID2%tvfKbK!E$qYSEdXH0v73YM zhz?6wLPHwiWCl{G(ufor6>@G>KP!VzRSrbOP}Ya7PHe@B-N6{`kI@cq)_uG`dL`I@ zDJX>r$IVdZMJ##NeuTc9ei)$km_%SLi7b^_$MI17bSNIAJuvmW(mHzANzOPqBvIKp z3!oxbNg5NWw_1h<$f7%V_N@PG#Pl)qP>Pc$OF$>ba$`i1B;yAImUBq*JZHY^HyXk zDl`7-jL*O5Ic?;fvqL~F)I6S<*Xk`yJ&goe!V_rKalrK7%m#hYhF6MOu zf~?I+%`fM^X(1BbTwMbzvV5`ZbtCUz0^RMl#;J9ZYt(~y%3W)6q7(6DF%nP7g)P(s+ zk|c?i*5Wj(buu=o3m4QQvRm4QmGnL)-}}7*=Au&a1k0U zlnc^FpWU6fbn(*o__+^!=_<3(jDy6vG+*lIwq2v08j!O`i%~Pnqtb$*Xc}c)TGMR` zFL___Jr{YRr^)aP!FW`7>U;LN&csnfp(JF!qkNnG*Wp>|Ol{j-31QYTN4+tKR6Q<#4a4{voMw~e2)*t`0=*dY@p`+<5Q9WrR4 z-wu?CcF45K&cXh~I%HyR)D!QJiN8^%t3#$$nMgeWZ!r<>kOT_KCvWu>P<~lOS|hXM znJ|-2d%Y094=8hF$C3@*cD!U*K)lpWHSpp*-My|LalJ$}NJKB)mRw#kj3|b_c>M>& z5;YlG3Km3dYLr+v5UdB`5JE4)TL^C>97e$X7&_}q+8A_UCP>ojTgA_9%Iu<{SEyNj zNcG~h_0TjdMYD^3eRLqj?(2?RT^g$>x>*EIR4ZT>r;+gUrBGe+60)qDuvA{|)mVo5^iu zMm94e>ysOqiAH7um}xgVu$4W#nLWFHVRPms=yUsr_ECQltY{?Oar0BypK{-Y9U2Mgh5z3M%ZVCh z4GgR%p%<|omKWJk znDHVH6lUX2lEjWd$MEQuGr^nP6M&~+&XxdH$lvK>kJMkzZ={DC>ER}cB(v_(?4yfo zxBoekS-ZvXDDtTp-Wd8Z$1vyeNjyJOsZd$=QnFmJv>JR7Wv^S7zpW`oFw-T=nx)Dz z%Ylp+U4ZEDBD(3Y4A8!b&xo&s+l=psA)pGo3a}FXnqqG((If~b$oIkzLQ^E4igRK! 
z(i6MXBmnM}4}HQd-{(w`d?L=YE$~-yI3A5c_>thX^~Z7KIUIdqF#!h#WY-emVflW6kjr>3SXSo<-~-GNma5+{=Fo)x9r?l z%G7DyM^4+4U8__SwgkPpC@Hpt6+u$18hor2Q-kXbUQpT?c$=wEN5LBn|4OMXpU_?m z9CUj7HUNB31Sn)M;;k+M6^3j*0#q2X^(as%>9=|eC=3$6VFBb}8}oXD`qi?*I~>+9 zIplQ*i+n%4l)r+H<0b!Emyi3w&MtfKoL~2@m#k(en#fA<9t{1h< z~-QQPugqjEUd@#weJBmBZ2_AB??h63?T8I0GZMWFOm0ZXJST331CnTd=LxRb#7Y?Dm5|L7aFe@~Gn- zvv;&Dm18Z zq^C2}Y5oPV#o`MF`lQbd>1Lsz83lKwg4Jd1ZcfpxyqT+F9TxfTYr3ss&_Z^DT{y|O z(R@+OujDi%r)!q&a`Ou}ul~p`S6tq#+LfveEO6dOr0kU-jKl*kZI)QSfYyDsw+3U0 zs6xxm;4Tk|YRm(mc+{h^#_S#pyFK5&M#msK++%`js1)#K3Uj~%(J=3I{~+DE-;VYO zksa&9HUIFw;n_TDFhBd-Iy)ra|7NK*wy)26;9I+V{k35!L%V`C<}*g|*8fX^TG z{JtKx`^QD4p7n?1gs23rFF^?_{Fjjj5DAZZ(Xh_f_>$Wn8c_6e6>-YRh!fJN*oqlrmC(CgpY6 zvWTQE6RRs^8JlH^CQLdfGJ8z>*;9JwrVTlnaD-f{<{`_kO3NmeEP!C4tMuLpKxv}N zX=z?vmaDqmWe@q7r<^}h?J70~bYCV`mNQ9dXq|+>k3*>F1}*I znA}QW{m2Ft3V zRm+7mEnm;M;T~ynzw0wCm#xTl(G9A1;pwy7KzRimlS=U!lT#h30)@_&7bTjV;yMcqr^>fF1dgSt^v8*Yo zktdvij@2y9a6^;;jLzoG%Bt%(mq5EaShLk+#N|AdF!>_6f{L+;b6s`aPj5{XM(Kls7*V?zecU^a`SZ*m3p%BvrzegeL?QaXG%=)Wa3zZaNRs{1qOx-eH+ zJyn)9BSk+Lo?kuMdGhNZ|BTb1e8P*6z0U{nQKiIg| z38L7zBTj8xdmbJAO}cUC(+gV{T2aY~N*#14F!>GXHkn;vq%ppI(1{=K3fTGB5t1Hy z!ii6{gcFW%;vwS*?`|^BrlxmFJEg~mNp0?%*^KN)qV@OhU%Gc``(!IJZ{vv zS{JrSop54Pn`}FhsLwf(W6j8PJ2qbb#EGRJ-+mgq*o<9lk00E+yTdk*pL3-5pQN8k zmz?pr2HRkt#Sd&vI`P>j`=7=yH{+MzK-f-)1qsSits zDJ-*C?&_-P%fvK`CN`%w~wnT zxx{SAGU0+&Ak)bfJ`QxQ53Bes0)&dPRn)a5H)!Q$qq2&R60CSlHN^(CY-(DOd^pR_-ePi#*z~natdCJB#r2mXCdDCzPx>wwVvZ+*c^%7oy zDf#4De}}H)XBejA3o&d5k!}nzBO5VN+bFU@Ya1n+UTdT92CV~txIyb68ALyGu8C$} fZ*BB$^S`w=nraTWPH=_+Geh?l^pyVrQ*>UZ literal 0 HcmV?d00001 diff --git a/discotoolkit/__pycache__/GeneSearch.cpython-311.pyc b/discotoolkit/__pycache__/GeneSearch.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35e1c64907d3e2154c8e4172c22ddc7472ac44c1 GIT binary patch literal 6477 zcmcIITWs4_mLyF{lw@17Wh<5wJG33gR+`v(G<7G_CX>WTYInw&#%Vfn>u4xlTB1vf z8j^}1OQGr@i}4l#>H-~99i&)&%w!$JgUJ9pK=x~cL4hf-Kcqpx0)YYr7K=sxGTZ4- ze!Az9q8?70#R9uDb?>?Np2t0pd+wqCsjV$Q!1LiR@1m{_g7|kla36ma_<99^PY6UH zQY7Y_o}44?F*Vm>_ws)Jft!=Iy`4P>BkcV&OLwtCF zin;O#oalE^1PW9r0nCvBtckon$Pl?+GM(>XM(K z@_#Rt_c>I7@1<&xg?wwE9>{OtglC@FL-4z~HCfXQ_V2SX^Z#$--TU4kZ!>Cl-Ghrk z7uI}QK=W%svl{D!EiQ=qoy{a{SgR`*#MpRH#e zwZA^qOpmEwfklmc|AdBlg&KH;>r{0}Bbzxi;6ti<_StpD+SCJeoW+O>7ur#{5K`mr zm>?EwHbxywh+&0L;0DuhvY zp+gI69Sc-}Qcu)5F7~>xMr}o#c@#RE_P@7=3A;ex2NXs@p;cmGPKj*RaX5CR^->i3G58M_#+MY>b{#+rZ^?Y}B z(_-a;MjkHQ%(K5PY{EV0;%Z%sBQA{gZoTkas?i$e;oX`vaj)efbmX~s zhg^OdfyTDRpDVi-h0XqkGOQhns-xU#2>pxsY)XZaid7|6;W;^#VHYHsRWm$`%PW>& z$;rH;U<{GJ$&2=i!Y%PEr`WKN&E-`Gw@RPnWLC(slT)+jFSE;hQV~>sED&JfIV-1? zQ#OQ0wt&%DRTi@8_$fAvxk?Ll*}x>l!FG zJ}hkh$AY3_Mh@r0?ByKRkP|`1n+h+ph+h!0JYpR}tnN)g$#bHha2Rono#Yp|yr^QE z%t&B_X4`P0$ezVTC$H2uQ7u2&Q8k|v`6ijOP?k38C37kW2ZFrJV?)^#&j_uvhJuzI z6|!m*)5)JsvEUd<%-h~`edcg&Vs_%_(W>>Y@M>PpHaGw)cZ$855fpa8jRPd{3Y(Qw z;Eu6bF1xB`z(H)1R|LefIZ;wo7~AGwyO2@UoN{V>e0h0!OjugVX8|jl7Wot}ier+T z9tQ`GgLyM~X&n3e!b%Q$^$NcL&>1}Q>adckRnHLJxa!3>_18zC`GiOl8ZjR!kQ(_1 zy1905J|`bhx4UMEd&E8XNLUahPQ6E2)Ywt0acCUwP*tr&+ltctB2`ori6;2qiiyyWE z0%Qe61%a>!MU_Dp{ug;(Q5CBNvVlb}DqveL2TEj-MP*?poGf#zmPg5nf@(*FMdgs? 
zp9FVZl;NhZf{G-o2|J`9j^LgWCBy>{1xJ>Qzh^8Oi?!&Klv}l^RLTmZA!{f3)kNkd zY$FT86%g)+CIM8~K9FX?NPO06$%(4vb)w7)+!Q2mGj_e@!w$lLcSRLa?iAvJfBuWb6R3S^$>ucM^`c#Bs}4wIdkNF{B@Y$}y_thtpOC zX`V}E6nxroHSyh#oztvfLxPqIK!Ed~@^3WG$lvr6Oy|b%XA#5KXZre{5TvK4(h=S8 zt$Ut$iQpAdfAhK#m@A(DGPLKmRyuElSTn>HU#kQoAHMp*tN(E3_L<@&V8-_9ba$mA zQk<@|hl`ggp$@>oLs;vKJ|R4w6!}-XUH4R&*bc*PGwkm=f7|0^{zu)9e(`zFU;6*t zum5z`ID6GRdleRpL+_Z-4$PYa^9FOnWNzrpjSADb!^F0kSZTOS-WfHRA(I)>nV||3 z`mp_j_Kke$jgMChhBX;hXV?l8{xI}GsN^YecbM{tPun*=o7@&-^c*#qV{eBmm!diH^v}t@_BzUoiSS(sOHi-`~M&gf%m)6{jlU*iLwOJ3L%oDK9>}pofQz z@HsPluDUX?9Udr;7~w-^_z+CR2Fnv>?1i1!@$K00KbR7THAPYB{Ya*BfP8a0o*8x)7TH>o65=^xxoJv{xSg$%q7F@)Q{ z*=_M0uxkz7{$Xj~=AH+_BSF8O)+JTf-UG_Z)OD)TJ6Jwj>f1aHw5{Vn+By!Dt>eYX zZy2oobg8B6)#(v~9x>?=06@__v7PAg?dWkM`l1fAoc4e~X80ER*M`ZwTUb zY6f?gsVi8kS!#~@D;~W~{mtoM^g8tp9^tMy1!K=V9zqKQxW%Eh6CT_S4;tYEX7~V( znNB-!@S%O#Zr14v)3L+!Y%@Kju7B+NP2Z>c@9f`+jcv!qHWedw!i=4GaMp~yWH1vZ zGojlrD7QmLx9RA{szDE!^ngwefH05m*3>$0(p@_=yG^s@?wXN+0G84w9o?Y^x9P$1 zNrR4?bX=$7PUu`NMars9A2#U2CVdzHXZp4Et7cnY`HD`D8uX}1j{@MVOs{JOz0aig z>GZxYY2U4BJv3m@gC-4eIfy5w)=}xEP9HSrgC>0tfGWrLr*BOcr@sn@)?akWtd^M> z8S)wt?~>0$CtrZZ^5)=T;uLur4?31O72jZ@kc2|kxyzhh{uy5PK_Q!zwobK)i$FUYKV@E7 zl2BgcPs_i8Ib7i?{|XI+C`nd`P|^OY5W%ATS0N(0`&Nkkdh=T$x^?%h5S_aFR)`+G s`EA5};uT3U{Di>mhT`<)3!6im>So+L@S-#EDDU)8)jGiQTig8q4O%uFH2?qr literal 0 HcmV?d00001 diff --git a/discotoolkit/__pycache__/GetMetadata.cpython-311.pyc b/discotoolkit/__pycache__/GetMetadata.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5678d4c13a2b32d510d96cd783adaec76ec0c1da GIT binary patch literal 13674 zcmeHOYiu0Xb)MPhEO*IWkxPmXQ5-%baczo{tXP&M>PcFUNTwW9QXM+VX0Y|J)&_@L~W0E&>oW1k9_uR)h_uTKCbNP3*wKW_aziA%-kFRmu@7O~r36!p1uHoiw zPU1#6iI-eyew1fd*QkqK-J>3M^^SVk)i>(L)tweHfzbfZNgm0YuE_*PgPG81h&}t# zwVArnx=eU9jOR8VIs)}PdMbxgz;#exJ$%-tAiL4}MbJ@xZ z%B;#-&F56daEv#-m$Io`_H!POF_-C2=f=lV*>N*6p~z#Y+i@wSCUfygC5?|Yrx@8G zkT8AL{YBG_Z>EsSj^*MRb=(Zdijq^yY>xF3PX8 zX|x=+lp3KQw|E$A1AA@O+Th~&_YvQd`IxSw6#bhpvQP3dWsVwMCK*kd)%Q&VaGP0OXK=#F4mUaB|kHN9w zSdBuQH9a&2rl82*os?BgHQg9!#YbO)s+Q0u)p#-|$)<l=lps z9T~i!Vd!q8wD`o-?o1+;-NPna>E#|w@s~?8ynkZK6wur_wM}V74fXh`KSVOeeYK7A zhVGnSXx2TghNrdQX?LD~K(Dv3q-mN$E7!5u1!o2Vqz4$kA@A7_KJfJtc zU^KiiFCf1VG5l?d>WaUs;O|-uH{73I?0sjq9_}^5z4M-8xM4o{m5&QV?$<7)bU`!( zu~0t0L@T%?CFO&Ln@#kuHc7aTeuH$qVo3_=uTvx`wxn$kA&3>J(Mhn5k(w6D z95L&`gF)1O5tWZjW-JcAMNLjq^GTo{?VfwDg2}xFBZqqd(k#HoWcv8K|1PvM& z0+pFiNxO*(Ee(uh+-i6{sl{23cHqN4{M7wOz~CA#RCoXIs?fZ6NEdb#dIxl2;9<|l z-(7RNLcT9JB)_L*ZipBknLDSn5oJg`=0e7-rSXdAFkPVEr;<#w;X55Rqh3Xeegsy{ z@N@OmdRcNwo3Vq}Y}P;@+Z5O9Zpl|!{bpTvxqMYAs{xccSG{|jG+U!Wm%E)hbweqSMl-c^%y^u+r9er~H*K!xr5a~`Xlx$ZeM^}h>UYg}Dt&b3 zhQJ0B$1$*DDJ;Y!Qp5-*x3^N-wG@;+-mxjq6d0|{OyHc_op@NKi`gTVXJ|PHs>-y! 
z2-z>wXA^^wAr<6cz1TaHN@_9j$hz0bM3z+Rgp#|NlAu|yPT3OxGmPC^(~RnoKmscZ zB%d|9G!4axDeYP=>s0o`x$I+=Jb630`TMMT@U9!U2{}Hnwal+uXL$6keS}l?f)Mpfkxic}CzRy1m`8aI zFHJs$^&Yxb*@Y*SMiR4Oj;UM))&%zyJ-tLF`;q(sT@7Ed&Ns&w5{uD)QvPx3>|D_@OBQ?Z?+^%?;OrCNrnVWW~F|yTF$aL%vj7}%M{fN3Z>8z z08E{o%*YC0jphMXA%IUF5Qk(36&K8 zcQ7KQMX1)ncI70#4&bN$772R4*31cw1^>3yX0aHF6&sJPxwx%K*P7qcQu76ekL$t- zLpV_oP7qAdkTDcl2rTv*k#0TMV+4B&o*qjW(Hr})m%eY~)#ES<8I6+rL(fJulDEat zridHK>s$h0yjZy911PSFy;f4!mg;9XmaPPUCJe+r1|L@;7?$mjI7IeYh3rOMU8X2H zK{TL^w9Z_#Y)A|53|`Z;3H8vPJxra+Dg-SplPvY;lyM8A#&7PACni#Rs`T;*17Bst zs%*z_uvwsV;LZ$`AWlk(xtX9V-PUV>TBscP_WQgklnvSq_=t@Zw5(}^7__T z0(ptfU9hI@F%WbhB(nJkwS3tRhL;dh&*qw{+Vb2SH#vo4mLKP!^=zw>mHZgx~`PvDM2%VrqIrcu2g`!P)C&&dv~4XEBS7acI1?M3}5+yufp_f z9S-ya^z}Hb*m&J2t_AwlWucnb+oZ zU_Y@B=tI@6!04tbN+c~fCj(<*fhi+Ih2^Bmtv8Q@h=lJP);z1%##a9mSn)i%NDUuh zVAnR>HXBq4eyeO1ekA-#BUCAFAxh;LFj9Gzl0HgyQ$i};0a$2aFu>A<8ym0$%YbFZ zb=Q59`*nc9kx&VaSeT@QLcC11Df@{!sg})vNDKmEBd&O|Bqf_cufHS;gl=d_<~-4w zO7icKwgvBkh6Qk8st^`&u6eoI`cK&{o|twseG>@?pr#P0fmeijr5vJ!JPKw_3Yy-+ zs4!(UML9~J2nQ&K>AMfE5IHN6nX3V2QGYDRA(R`foWMKQN5U4CecQNc_Y{JWH!1gF zOcCl&v7mCqs^MU!mXMu)=v+rtppvMUGtZ_WPH*A|102YFE^QpW2 z_xl(3t<-EQ)NEUGIYyR^@n;>qzl<(r9-J}yU(#czjMyo?;>fr?qV)wgkBJ4sH>B3UX zPyY?$TY@}?U`J`3EW@nqbk=mGrHYz`=|A;&*ZE)fa16sh9UYB0=rqz5~VU}wS8Sy}?D=#WEI5He!} zF2Ps!!~u8s{nn`wSGCfp#Au|A@f_|tEv)SL!Oj_1MLASsvKbg&6;%V~+GU&ww~_~_ zi>t(Fo=CaMXIEY3iIi>p`b5g!N_#-rF7rgnwxWC@Juqf*$Zg1UFPwWwMtVq#LKF@wPd`Pj_bBI%zw5J?q6Rwj1j#mnUqw;h0(H zD6tEZwOZWB^P*2Q4w}n7H{Pn>a0BcndI)zD@*7N zAKzAP|BM??wp>4{?Z`)t7fQE5fjyKov0FQPZf6YsEc(%*q z8Bo|sA5JkuQNRzfKiHwPXoN~iNA;|9mYm_@_I!Q9_(m$sbkKr*SMiKg@<0zf)x3Ae zrBesMhWP+L>u}BuM?Kv2o^mc}%S~R1I!1Jd{jJ{FZ*n76m29|yo&X>;Kn~nev{HlA zc!O!i%9~ktw%t*-sZ_RpL)onx%Q_@$U8L<#)5ecm?VomR(cdam@Ys=d{V&wLkI`yR z+w%%f|6lk0U)O!H=EM)7E-WZ6j!kA+NVT+u#&*nC%u{A6I-|nY4lJ>EkhXro+{I>` zvhS<($=Vb~k!BHJXa^30mGGcloz!4b=U@!eHZc|chUf5cj`AvC5UfWzYJw^PWGJSB55F_B|RYVk1lGN#{cp>6Siah>TKcnOG zV~T{&>(ef*^oiiY#_jv(&id7T;f&PCO7LVLuJ1Ul`9=ZsKi__r4P-9_veJMv!p((MO9W6#jTG2_%_z4I>&oDHwd3!nOH4S#dN z-@~%4hFTVT^8-ff-sPRfbAx(l$OsMn-tA(pQ2`|H%)((*p(q%8PCPvD(Gk7#8wSpX zR~f-O-^U)bN~RiP&)~z$AI0^~O9sw{%T}g03jWSYrhu{M+|HkLaO&MrhyiZXu z_RjWVsND#4zjtw|clogK%rSk(abw4Elxd|hR2hmyx4kt`3`G}SH$t7Cgm$ikcIMyE zLkEn|fpV{XT=y;<~+Tw2S+@5WLZkoy3LUC00w;KM|#l(ugqu}qLS~m(hx)N!=Ka1Y} zR9ZZkKWDV<)tmNFua7=_wz?-2iU3^*WvD1LeIjgM5w_=pi?h10+YokxS;D|4!toX1 z_`^nBIBf{0(YqFkVYK)Po<>yDwIlDvo#AQ!Y83ZLMj7CAe6Kt0SRr|^+6~@TMM2h%QenyJt%NGNar(8?M2-D z2s^0edF+Yg5wo|gPsG$IN9kC)e2s+ z&Npu?$cq;rT>S0Dk7NHF|0rHKauMhJOV$-B8()M)4x^yeFzn&InNXBB@WKj_e8<`` zc8zjS@&;8zUNlrxELVi;-uvb+wdKM0r#_t0cf4rqcoDDk;3We&JeRC$_=~7%CraY~ zE{0?6ravB+a>;mHi4biXV+zmGZgwj!dw||}fea_6lr;Maqye0&5O>YG)9KvRMEY_< zNhPkPWh=@<%ra|+X#Hnqfd$jZ2dxa#ClBq^x3-!Q+`@i!#QK$!^!*Ga1fv*)Q7HVw zq6-W@u%5HHHtPx-l1IWoCMQj%<)g|xGL!hJzd{Om=J_?3i}xVNgQSMzBXeQ;D{{d( z_E+Q@3gxrN1?Jdak*hD1&mz}caGsc?{DA`Z{Km7$JyY21T=Q+?5eG)H(9JIS26kDt zE)OL3v_3g1@R2pnPMSLT9cvttMK`-FN7-e4a)xi<@x@O1o~1`5`AbXtmpw}mi09e! 
zL)2RSR(1C+KFUWIlWQC<>y!Qf-?hfsNi)~?+|q&f4}N%X4HC+4wbJ~~>LljA6nDo;pEY>~Qk;%qkq@7=xky?6J$dwTc1(Xc?t=;BeN>i~a>U@0mt<>o-S z&w#;)dw>zv81hl+HD4>e?&}&5{n)r;6u7p;KlN`2;6m+65MaY!!8PB++CDfd=bx-T z5tEcAQc@m!wlyQEsL;zy9`V$(-;g*uCDa~Js~<|@+1++`v(e}qu{V4@%1xpd(VM@a>BOA%tZT;Dl6$8E{D}yZ zS?CC%g=QjS0flKA%G?LkoUavn4Xg7~ui?SG)Z%b`ZeGE|FS}m{m*D&Y=L#GJep`*X RxoE=r3v>z1YNZYJ#y`%duHOIv literal 0 HcmV?d00001 diff --git a/discotoolkit/__pycache__/__init__.cpython-311.pyc b/discotoolkit/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b82f3d3003d824bd4f5685a4acaafc916254dbe3 GIT binary patch literal 375 zcmZ3^%ge<81pQ6RQsw~Z#~=<2FhLogjev~l3@HpLj5!P;5SkH6GeK!)D9r+-nW9)z zn1dNKSza;%HEJ^6V$`Z)HPkcIGxpPDyT$FETH>2pl9-ZMl6Z^DC9^m=-#I6-xcHW! zOMZD?PJUtvSj+_^&F1XtoD!1`tKq}y$yO7J#nX9zi*i^`RFncz;97y( z+-|)8Khb}g|F22fNdYPFuN1J=?s>Q6D^+iuy`1;jMt`Au&6)1Tbx;_h9TTG+bK~v! dA&RoD`I_gwa7qk1^Fb%-XTWulNrAss;2Q^Y6 📦 - 1.1.3 + 1.1.4 diff --git a/docs/scEnrichment.ipynb b/docs/scEnrichment.ipynb old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py index d425795..5732d1c 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='discotoolkit', - version="1.1.3", + version="1.1.4", url='http://www.immunesinglecell.org/', author='Li Mengwei, Rom Uddamvathanak', author_email='uddamvathanak_rom@immunol.a-star.edu.sg',