diff --git a/README.md b/README.md index 6ca1990..cc90690 100644 --- a/README.md +++ b/README.md @@ -1,253 +1,16 @@ # Scoary 2 -Associate genes with traits! +Scoary 2 associates orthogenes (e.g. generated using [OrthoFinder][orthofinder] +or [Roary][roary] to traits. It reports a list of genes sorted by strength of +association per trait. The results can be explored interactively with a simple, static HTML/JS app. -# Installation +# Wiki -1) [Python](https://pypi.org/project/scoary-2/) 3.10+: `pip install scoary-2` -2) [Docker](https://hub.docker.com/r/troder/scoary-2): `docker pull troder/scoary-2` - -# Usage - -Examples: - -1) Dataset from Scoary 1 - -```shell -# Dataset from Scoary 1: genes in Roary gene count format -scoary \ - --genes Gene_presence_absence.csv \ - --traits Tetracycline_resistance.csv \ - --outdir out \ - --n-permut 1000 -``` - -See output [here](https://gfv-oberburg.ch/FILES/SCOARY2_TETR/overview.html). - -2) OrthoFinder-based data - -```shell - scoary \ - --genes N0.tsv \ - --gene-data-type 'gene-list:\t' \ - --gene-info N0_best_names.tsv \ - --traits traits.tsv \ - --trait-data-type 'gaussian:\t' \ - --trait-info trait_info.tsv \ - --isolate-info isolate_info.tsv \ - --n-permut 200 \ - --n-cpus 7 \ - --outdir out -# the following are optional: gene-info, trait-info, isolate-info -``` - -See output of a huge, unfiltered dataset [here](https://gfv-oberburg.ch/FILES/SCOARY2.2/overview.html). -(Takes a few seconds to load.) - -## Docker cookbook - -Run interactive shell: - -```shell -docker run \ - -u $(id -u ${USER}):$(id -g ${USER}) \ - -it --rm \ - -v /path/to/data:/data:Z \ - troder/scoary-2 \ - /bin/bash -``` - -## Options - -Below is the output of `scoary --help`: - -```text -POSITIONAL ARGUMENTS - GENES - Type: str - Path to gene presence/absence table: columns=isolates, rows=genes - TRAITS - Type: str - Path to trait presence/absence table: columns=traits, rows=isolates - OUTDIR - Type: str - Directory to place output files - -FLAGS - --multiple_testing=MULTIPLE_TESTING - Type: str - Default: 'bonferroni:0.999' - "method:cutoff" for filtering genes after Fisher's test, where cutoff is a number and method is one of [bonferroni, sidak, holm-sidak, holm, simes-hochberg, hommel, fdr_bh, fdr_by, fdr_tsbh, fdr_tsbky] - --gene_info=GENE_INFO - Type: Optional[str] - Default: None - Path to file that describes genes: columns=arbitrary properties, rows=genes - --trait_info=TRAIT_INFO - Type: Optional[str] - Default: None - Path to file that describes traits: columns=arbitrary properties, rows=traits - --isolate_info=ISOLATE_INFO - Type: Optional[str] - Default: None - Path to file that describes isolates: columns=arbitrary properties, rows=isolates - --newicktree=NEWICKTREE - Type: Optional[str] - Default: None - Path to a custom tree in Newick format - --pairwise=PAIRWISE - Type: bool - Default: True - If False, only perform Fisher's test. If True, also perform pairwise comparisons algorithm. - --n_permut=N_PERMUT - Type: int - Default: 500 - Post-hoc label-switching test: perform N permutations of the phenotype by random label switching. Low p-values suggest that the effect is not merely lineage-specific. - --restrict_to=RESTRICT_TO - Type: Optional[str] - Default: None - Comma-separated list of isolates to which to restrict this analysis - --ignore=IGNORE - Type: Optional[str] - Default: None - Comma-separated list of isolates to be ignored for this analysis - --n_cpus=N_CPUS - Type: int - Default: 1 - --trait_data_type=TRAIT_DATA_TYPE - Type: str - Default: 'binary:,' - "::::" How to read the traits table. Example: "gene-list:\t" for OrthoFinder N0.tsv table - --gene_data_type=GENE_DATA_TYPE - Type: str - Default: 'gene-count:,' - ":" How to read the genes table. Example: "gene-list:\t" for OrthoFinder N0.tsv table - --random_state=RANDOM_STATE - Type: Optional[int] - Default: None - Set a fixed seed for the random number generator - --limit_traits=LIMIT_TRAITS - Type: Optional[int, int] - Default: None - Limit the analysis to traits n to m. Useful for debugging. Example: "(0, 10)" -``` - -# Overview of the algorithm - -![algorithm flowchart](media/ScoaryWorkflow.drawio.svg) - -# Usage of the picking algorithm in Python - -Python bindings to the _pairwise comparisons algorithm_, as described in -[Read, 1995](https://doi.org/10.1006/jtbi.1995.0047), -[Maddison, 2000](https://doi.org/10.1006/jtbi.1999.1050) and -[Brynildsrud, 2016](https://doi.org/10.1186/s13059-016-1108-8). - -
- - Simple pair picking - -```python -from pprint import pprint -from scoary import ScoaryTree, pick_single, print_tree - -tree = [['isolate1', 'isolate2'], [['isolate3', 'isolate4'], ['isolate5', 'isolate6']]] - -label_to_trait_a = { - 'isolate1': True, - 'isolate2': False, - 'isolate3': True, - 'isolate4': False, - 'isolate5': True, - 'isolate6': False, -} - -label_to_trait_b = { - 'isolate1': True, - 'isolate2': False, - 'isolate3': True, - 'isolate4': False, - 'isolate5': True, - 'isolate6': False, -} - -print_tree( - ScoaryTree.from_list(tree), - label_to_trait_a, label_to_trait_b -) -# /-11_isolate1 -# /-| -# | \-00_isolate2 -# | -# --| /-11_isolate3 -# | /-| -# | | \-00_isolate4 -# \-| -# | /-11_isolate5 -# \-| -# \-00_isolate6 - -result = pick_single(tree, label_to_trait_a, label_to_trait_b, calc_pvals=True) -pprint(result) -# {'best_fisher_p': 0.25, -# 'max_contrasting_pairs': 3, -# 'max_opposing_pairs': 0, -# 'max_supporting_pairs': 3, -# 'worst_pval': 0.25} - ``` - -
- -
- - Parallel pair picking - -This takes advantage of [Numba](https://numba.pydata.org/) optimizations. - -```python -import pandas as pd -from scoary import pick - -tree = [['isolate1', 'isolate2'], ['isolate3', 'isolate4']] - -# e.g. phenotype -label_to_trait_a = { - 'isolate1': True, - 'isolate2': False, - 'isolate3': False, - 'isolate4': True, -} - -# e.g. presence/absence of genes -trait_b_df = pd.DataFrame( - columns=['isolate1', 'isolate2', 'isolate3', 'isolate4'], - data=[ - [True, True, False, False], # gene 1 - [True, False, True, False], # gene 2 - [True, False, False, True], # ... - [False, True, True, False], - [False, True, False, True], - [False, True, False, True], - [False, True, False, True], - [False, True, False, True], - ] -) - -max_contr, max_suppo, max_oppos, best, worst = pick( - tree=tree, - label_to_trait_a=label_to_trait_a, - trait_b_df=trait_b_df, - calc_pvals=True -) - -print(f'{max_contr=}\n{max_suppo=}\n{max_oppos=}\n{best=}\n{worst=}') -# max_contr=array([1, 2, 2, 2, 2, 2, 2, 2]) -# max_suppo=array([1, 1, 2, 0, 1, 1, 1, 1]) -# max_oppos=array([1, 1, 0, 2, 1, 1, 1, 1]) -# best=array([1. , 1. , 0.5, 0.5, 1. , 1. , 1. , 1. ]) -# worst=array([1. , 1. , 0.5, 0.5, 1. , 1. , 1. , 1. ]) -``` - -
+- [Installation](wiki/Installation) +- [Usage](wiki/Usage) +- [Input](wiki/Input) +- [Output](wiki/Output) +- [Tutorial](wiki/Tutorial) # Todo: @@ -263,3 +26,7 @@ print(f'{max_contr=}\n{max_suppo=}\n{max_oppos=}\n{best=}\n{worst=}') - [ ] `overview.html` & `trait.html`: ~~set proper title~~, add logo/favicon - [ ] `overview.html` & `trait.html`: link to table download - [ ] Improve `README.md` + +[orthofinder]: https://github.com/davidemms/OrthoFinder/ + +[roary]: https://sanger-pathogens.github.io/Roary/ diff --git a/media/ScoaryWorkflow.drawio.svg b/media/ScoaryWorkflow.drawio.svg deleted file mode 100644 index 4a82bdf..0000000 --- a/media/ScoaryWorkflow.drawio.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -calculate_confidence_intervalTrait data
Binary traits
Binary traits
Numeric traits
Numeric traits
Gene data
Binary matrix
(Roary)
Binary matrix(Roary)
Gene matrix
(Orthofinder)
Gene matrix...
VCF
format
VCF...
Trait matrix
(boolean)
Trait matrix...
Gene matrix
(boolean)
Gene matrix...
load_traits
load_traits
load_genes
load_genes
pair_picking
pair_picking
for each
trait
for each...
init_result_dfindex: genesadd col: contingency tableadd col: specificityadd col: sensitivityPhylogeny data(optional)
Newick file
Newick file
Tree object
(ScoaryTree class)
Tree object...
parse_newick
parse_newick
ScoaryTree.prune
ScoaryTree.prune
create_test_dfindex: contingency tableadd col: Fisher-unique tablesadd col: Fisher's pvalue
add_odds_ratio
add_odds_ratio
perform_multiple_testing_correction
(initial filtration on Fisher's pvalue)
perform_multiple_testing_correction(...
merge test_df and result_df
(left outer join on contingency table)
merge test_df and result_df...
for each
gene
for eachgene
permute_trait
permute_trait
pair_picking
pair_picking
empirical pvalue
empirical pvalue
create_summarycalculate dendrograminteractive output (html/js)
Speed up using...

faster algorithm

numba

caching

parallel
Speed up using......
Trait output
(tsv, html)
Trait output...
Summary
(html)
Summary...
ScoaryTree \.from_presence_absencehemming distancesupgma algorithmsave_trait_resultinteractive output (html/js)
Text is not SVG - cannot display
\ No newline at end of file diff --git a/scoary/load_traits.py b/scoary/load_traits.py index 24e77be..4d71faf 100644 --- a/scoary/load_traits.py +++ b/scoary/load_traits.py @@ -38,6 +38,7 @@ def filter_df(df: pd.DataFrame, restrict_to: str = None, ignore: str = None) -> def load_binary(traits: str, delimiter: str, restrict_to: str = None, ignore: str = None, limit_traits: (int, int) = None): + logger.debug(f'Loading binary traits: {traits=} {delimiter=}') dtypes = defaultdict(lambda: int) dtypes["index_column"] = str traits_df = pd.read_csv(traits, delimiter=delimiter, index_col=0, dtype=dtypes, na_values=STR_NA_VALUES) @@ -71,6 +72,7 @@ def load_binary(traits: str, delimiter: str, restrict_to: str = None, ignore: st def load_numeric(traits: str, delimiter: str, restrict_to: str = None, ignore: str = None, limit_traits: (int, int) = None): + logger.debug(f'Loading numeric traits: {traits=} {delimiter=}') dtypes = defaultdict(lambda: float) dtypes["index_column"] = str numeric_df = pd.read_csv(traits, delimiter=delimiter, index_col=0, dtype=dtypes, na_values=STR_NA_VALUES) @@ -236,7 +238,7 @@ def fn_km(kmeans, gm, ns, trait, proc_id, container_binarized_traits, container_ def fn_gm(kmeans, gm, ns, trait, proc_id, container_binarized_traits, container_binarization_info, extract_covariances): print_status(ns, trait, proc_id) - logger.debug(f'Binarizing {trait=} using gaussian mixture') + logger.debug(f'Binarizing {trait=} using gaussian mixture.') try: binarized_data, metadata = apply_gm(gm, ns.numeric_df[trait], certainty_cutoff=ns.cutoff, extract_covariances=extract_covariances) @@ -333,6 +335,8 @@ def binarize( from .init_multiprocessing import init, mp mgr, ns, counter, lock = init() + logger.info(f'Binarizing traits: {method=} {alternative=} {cutoff=} {covariance_type=} {n_cpus=} {random_state=}') + ns = BinarizeTraitNamespace.create_namespace(ns, { 'start_time': datetime.now(), 'lock': lock, diff --git a/scoary/scoary.py b/scoary/scoary.py index 6a64515..960bbc8 100644 --- a/scoary/scoary.py +++ b/scoary/scoary.py @@ -47,7 +47,8 @@ def scoary( Low p-values suggest that the effect is not merely lineage-specific. :param restrict_to: Comma-separated list of isolates to which to restrict this analysis :param ignore: Comma-separated list of isolates to be ignored for this analysis - :param n_cpus: Number of CPUs that should be used + :param n_cpus: Number of CPUs that should be used. There is overhead in multiprocessing, so if the dataset is + small, use n_cpus=1 :param trait_data_type: "::::" How to read the traits table. Example: "gene-list:\\t" for OrthoFinder N0.tsv table :param gene_data_type: ":" How to read the genes table. Example: "gene-list:\\t" for diff --git a/scoary/utils.py b/scoary/utils.py index f96978d..faf4e27 100644 --- a/scoary/utils.py +++ b/scoary/utils.py @@ -226,7 +226,7 @@ def load_info_file( info_df = pd.read_csv(info_file, index_col=0, delimiter='\t') assert info_df.index.name == merge_col, \ - f'The file {info_file} is improperly formatted: The first column must be named "{merge_col}".' \ + f'The file {info_file} is improperly formatted: The first column must be named "{merge_col}". ' \ f'Current name: {info_df.index.name}. Remaining columns: {info_df.columns.tolist()}' if expected_overlap_set is not None: diff --git a/tests/init_tests.py b/tests/init_tests.py index 71d3fc5..976eab2 100644 --- a/tests/init_tests.py +++ b/tests/init_tests.py @@ -22,6 +22,7 @@ }, 'tetracycline': { 'genes': 'Gene_presence_absence.csv', + 'gene-info': 'gene-info.tsv', 'traits': 'Tetracycline_resistance.csv', 'traits-numeric': 'Tetracycline_resistance_numeric.csv', 'restrict_to': 'Restrict_to.csv', diff --git a/tests/test_scoary.py b/tests/test_scoary.py index 12cbb2e..c4b769e 100644 --- a/tests/test_scoary.py +++ b/tests/test_scoary.py @@ -38,6 +38,16 @@ def test_scoary_multi_threaded(self): outdir=self.tempdir ) + def test_scoary_gene_info(self): + scoary( + genes=get_path('tetracycline', 'genes'), + gene_info=get_path('tetracycline', 'gene-info'), + traits=get_path('tetracycline', 'traits'), + n_permut=1000, + n_cpus=1, + outdir=self.tempdir + ) + def test_scoary_long(self): scoary( genes=get_path('new_ds', 'genes-hog'), @@ -72,6 +82,24 @@ def test_scoary_long_numeric(self): pairwise=False ) + def test_scoary_gauss_kmeans(self): + scoary( + genes=get_path('new_ds', 'genes-hog'), + gene_info=get_path('new_ds', 'genes-hog-info'), + gene_data_type='gene-list:2', + traits=get_path('new_ds', 'traits-lc'), + trait_data_type=f'gaussian:kmeans:\t', + trait_info=get_path('new_ds', 'traits-lc-meta'), + isolate_info=get_path('new_ds', 'isolate-meta'), + n_permut=200, + restrict_to=RESTRICT_TO, + random_state=42, + n_cpus=7, + outdir=self.tempdir, + limit_traits=(0, 100), + pairwise=False + ) + def test_scoary_real(self): scoary( multiple_testing='bonferroni:0.1',