diff --git a/doc/CallerComparisonPlot.pdf b/doc/CallerComparisonPlot.pdf deleted file mode 100644 index a30922a2..00000000 Binary files a/doc/CallerComparisonPlot.pdf and /dev/null differ diff --git a/doc/iGenVar_Workflow.drawio b/doc/iGenVar_Workflow.drawio index 5e9ca43b..64698b78 100644 --- a/doc/iGenVar_Workflow.drawio +++ b/doc/iGenVar_Workflow.drawio @@ -1,6 +1,6 @@ - + - + @@ -16,7 +16,7 @@ - + @@ -34,7 +34,7 @@ - + diff --git a/doc/iGenVar_Workflow.png b/doc/iGenVar_Workflow.png index 5f722db7..d78fc466 100644 Binary files a/doc/iGenVar_Workflow.png and b/doc/iGenVar_Workflow.png differ diff --git a/doc/plots/2021-06-10_iGenVar_Precision_Recall-F1-multimatch.png b/doc/plots/2021-06-10_iGenVar_Precision_Recall-F1-multimatch.png new file mode 100644 index 00000000..d74a5053 Binary files /dev/null and b/doc/plots/2021-06-10_iGenVar_Precision_Recall-F1-multimatch.png differ diff --git a/doc/plots/2021-06-10_iGenVar_Precision_Recall-F1.png b/doc/plots/2021-06-10_iGenVar_Precision_Recall-F1.png new file mode 100644 index 00000000..6c07b614 Binary files /dev/null and b/doc/plots/2021-06-10_iGenVar_Precision_Recall-F1.png differ diff --git a/doc/plots/2021-06-10_iGenVar_Precision_Recall.png b/doc/plots/2021-06-10_iGenVar_Precision_Recall.png new file mode 100644 index 00000000..52d1a13c Binary files /dev/null and b/doc/plots/2021-06-10_iGenVar_Precision_Recall.png differ diff --git a/doc/plots/CallerComparisonPlot.pdf b/doc/plots/CallerComparisonPlot.pdf new file mode 100644 index 00000000..e1b5af82 Binary files /dev/null and b/doc/plots/CallerComparisonPlot.pdf differ diff --git a/doc/plots/hierarchical_clustering_cutoff.results.all.png b/doc/plots/hierarchical_clustering_cutoff.results.all.png new file mode 100755 index 00000000..dbaee493 Binary files /dev/null and b/doc/plots/hierarchical_clustering_cutoff.results.all.png differ diff --git a/doc/plots/max_overlap.results.all.png b/doc/plots/max_overlap.results.all.png new file mode 100755 index 00000000..2460b093 Binary files /dev/null and b/doc/plots/max_overlap.results.all.png differ diff --git a/doc/plots/max_tol_inserted_length.results.all.png b/doc/plots/max_tol_inserted_length.results.all.png new file mode 100755 index 00000000..d409c632 Binary files /dev/null and b/doc/plots/max_tol_inserted_length.results.all.png differ diff --git a/doc/plots/max_var_length.results.all.png b/doc/plots/max_var_length.results.all.png new file mode 100755 index 00000000..bcfcffe3 Binary files /dev/null and b/doc/plots/max_var_length.results.all.png differ diff --git a/doc/plots/min_var_length.results.all.png b/doc/plots/min_var_length.results.all.png new file mode 100755 index 00000000..e61ec750 Binary files /dev/null and b/doc/plots/min_var_length.results.all.png differ diff --git a/include/iGenVar.hpp b/include/iGenVar.hpp index 20eb0b9a..1de60e88 100644 --- a/include/iGenVar.hpp +++ b/include/iGenVar.hpp @@ -26,12 +26,12 @@ struct cmd_arguments /* -r */ refinement_methods refinement_method{no_refinement}; // default: no refinement // SV specifications: /* -k */ int32_t min_var_length = 30; - /* -l */ int32_t max_var_length = 1000000; + /* -l */ int32_t max_var_length = 100000; /* -m */ int32_t max_tol_inserted_length = 5; /* -n */ int32_t max_overlap = 10; /* -q */ int32_t min_qual = 1; // Clustering specifications: - /* -w */ double hierarchical_clustering_cutoff = 10; + /* -w */ double hierarchical_clustering_cutoff = 100; /* x? */ // Refinement specifications: /* y, z? */ diff --git a/test/benchmark/caller_comparison/workflow/rules/callers.smk b/test/benchmark/caller_comparison/workflow/rules/callers.smk index 14866156..0b7c38e1 100644 --- a/test/benchmark/caller_comparison/workflow/rules/callers.smk +++ b/test/benchmark/caller_comparison/workflow/rules/callers.smk @@ -22,7 +22,7 @@ rule run_igenvar: """ # Defaults: # --clustering_methods hierarchical_clustering --refinement_methods no_refinement - # --max_tol_inserted_length 5 --max_overlap 10 --hierarchical_clustering_cutoff 10 + # --max_tol_inserted_length 5 --max_overlap 10 --hierarchical_clustering_cutoff 100 # SVIM rule run_svim: diff --git a/test/benchmark/parameter_benchmarks/Snakefile b/test/benchmark/parameter_benchmarks/Snakefile index 23a693e0..e1555a0b 100644 --- a/test/benchmark/parameter_benchmarks/Snakefile +++ b/test/benchmark/parameter_benchmarks/Snakefile @@ -1,21 +1,21 @@ rule all: input: - expand("results/plots/{parameter_name}.results.all.png", + expand("results/parameter_benchmarks/plots/{parameter_name}.results.all.png", parameter_name="min_var_length"), - expand("results/plots/{parameter_name}.results.all.png", + expand("results/parameter_benchmarks/plots/{parameter_name}.results.all.png", parameter_name="max_var_length"), - expand("results/plots/{parameter_name}.results.all.png", + expand("results/parameter_benchmarks/plots/{parameter_name}.results.all.png", parameter_name="max_tol_inserted_length"), - expand("results/plots/{parameter_name}.results.all.png", + expand("results/parameter_benchmarks/plots/{parameter_name}.results.all.png", parameter_name="max_overlap"), - expand("results/plots/{parameter_name}.results.all.png", + expand("results/parameter_benchmarks/plots/{parameter_name}.results.all.png", parameter_name="hierarchical_clustering_cutoff") rule run_igenvar: input: bam = "data/long_reads/HG002.Sequel.10kb.pbmm2.hs37d5.whatshap.haplotag.RTG.10x.trio_sorted.bam" output: - vcf = "results/{parameter_name}/{parameter_value}_output.vcf" + vcf = "results/parameter_benchmarks/{parameter_name}/{parameter_value}_output.vcf" shell: """ ./build/iGenVar/bin/iGenVar -t 1 -j {input.bam} -o {output.vcf} --vcf_sample_name HG002 \ @@ -25,9 +25,9 @@ rule run_igenvar: rule filter_vcf: input: - vcf = "results/{parameter_name}/{parameter_value}_output.vcf" + vcf = "results/parameter_benchmarks/{parameter_name}/{parameter_value}_output.vcf" output: - "results/{parameter_name}/{parameter_value}_output.min_qual_{min_qual}.vcf" + "results/parameter_benchmarks/{parameter_name}/{parameter_value}_output.min_qual_{min_qual}.vcf" shell: "bcftools view -i 'QUAL>={wildcards.min_qual}' {input.vcf} > {output}" @@ -49,12 +49,12 @@ rule tabix: rule truvari: input: - vcf = "results/{parameter_name}/{parameter_value}_output.min_qual_{min_qual}.vcf.gz", - index = "results/{parameter_name}/{parameter_value}_output.min_qual_{min_qual}.vcf.gz.tbi" + vcf = "results/parameter_benchmarks/{parameter_name}/{parameter_value}_output.min_qual_{min_qual}.vcf.gz", + index = "results/parameter_benchmarks/{parameter_name}/{parameter_value}_output.min_qual_{min_qual}.vcf.gz.tbi" params: - output_dir = "results/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}" + output_dir = "results/parameter_benchmarks/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}" output: - summary = "results/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}/summary.txt" + summary = "results/parameter_benchmarks/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}/summary.txt" conda: "../envs/truvari.yaml" shell: @@ -65,9 +65,9 @@ rule truvari: rule reformat_truvari_results: input: - "results/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}/summary.txt" + "results/parameter_benchmarks/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}/summary.txt" output: - "results/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}/pr_rec.txt" + "results/parameter_benchmarks/truvari/{parameter_name}/{parameter_value}_min_qual_{min_qual}/pr_rec.txt" threads: 1 shell: """ @@ -77,27 +77,27 @@ rule reformat_truvari_results: rule cat_truvari_results_full: input: - input1 = expand("results/truvari/min_var_length/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", - parameter_value=[10, 30, 50, 100, 500], # default: 30 + input1 = expand("results/parameter_benchmarks/truvari/min_var_length/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", + parameter_value=[10, 30, 50, 100], # default: 30 min_qual=range(1, 51)), - input2 = expand("results/truvari/max_var_length/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", - parameter_value=[1000, 10000, 100000, 1000000, 10000000], # default: 1.000.000 + input2 = expand("results/parameter_benchmarks/truvari/max_var_length/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", + parameter_value=[1000, 10000, 100000, 1000000], # default: 100.000 min_qual=range(1, 51)), - input3 = expand("results/truvari/max_tol_inserted_length/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", + input3 = expand("results/parameter_benchmarks/truvari/max_tol_inserted_length/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", parameter_value=[1, 10, 100, 1000], # default: 5 min_qual=range(1, 51)), - input4 = expand("results/truvari/max_overlap/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", - parameter_value=[1, 10, 100, 200], # default: 10 + input4 = expand("results/parameter_benchmarks/truvari/max_overlap/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", + parameter_value=[1, 10, 100, 1000], # default: 10 min_qual=range(1, 51)), - input5 = expand("results/truvari/hierarchical_clustering_cutoff/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", - parameter_value= [20, 50, 100, 200, 500, 5000], # default: 10 + input5 = expand("results/parameter_benchmarks/truvari/hierarchical_clustering_cutoff/{parameter_value}_min_qual_{min_qual}/pr_rec.txt", + parameter_value= [20, 50, 100, 1000, 10000], # default: 50 min_qual=range(1, 51)) output: - summary1 = "results/truvari/min_var_length/all_results.txt", - summary2 = "results/truvari/max_var_length/all_results.txt", - summary3 = "results/truvari/max_tol_inserted_length/all_results.txt", - summary4 = "results/truvari/max_overlap/all_results.txt", - summary5 = "results/truvari/hierarchical_clustering_cutoff/all_results.txt" + summary1 = "results/parameter_benchmarks/truvari/min_var_length/all_results.txt", + summary2 = "results/parameter_benchmarks/truvari/max_var_length/all_results.txt", + summary3 = "results/parameter_benchmarks/truvari/max_tol_inserted_length/all_results.txt", + summary4 = "results/parameter_benchmarks/truvari/max_overlap/all_results.txt", + summary5 = "results/parameter_benchmarks/truvari/hierarchical_clustering_cutoff/all_results.txt" shell: """ cat {input.input1} > {output.summary1} && cat {input.input2} > {output.summary2} && \ @@ -107,8 +107,8 @@ rule cat_truvari_results_full: rule plot_pr_all_results: input: - "results/truvari/{parameter_name}/all_results.txt" + "results/parameter_benchmarks/truvari/{parameter_name}/all_results.txt" output: - "results/plots/{parameter_name}.results.all.png" + "results/parameter_benchmarks/plots/{parameter_name}.results.all.png" shell: "Rscript --vanilla Repos/iGenVar/test/benchmark/parameter_benchmarks/plot_all.R {input} {wildcards.parameter_name} {output}" diff --git a/test/benchmark/parameter_benchmarks/plot_all.R b/test/benchmark/parameter_benchmarks/plot_all.R index fa7e2d34..08553efa 100644 --- a/test/benchmark/parameter_benchmarks/plot_all.R +++ b/test/benchmark/parameter_benchmarks/plot_all.R @@ -7,15 +7,15 @@ args = commandArgs(trailingOnly=TRUE) parameter_name = args[2] if (parameter_name == "min_var_length") { - parameter_values = c(10,30,50,100,500) + parameter_values = c(10,30,50,100) } else if (parameter_name == "max_var_length") { - parameter_values = c(1000,10000,100000,1000000,10000000) + parameter_values = c(1000,10000,100000,1000000) } else if (parameter_name == "max_tol_inserted_length") { parameter_values = c(1,10,100,1000) } else if (parameter_name == "max_overlap") { - parameter_values = c(1,10,100,200) + parameter_values = c(1,10,100,1000) } else { # hierarchical_clustering_cutoff - parameter_values = c(20,50,100,200,500,5000) + parameter_values = c(20,50,100,1000,10000) } res <- read_tsv(args[1], col_names = c(parameter_name, "min_qual", "metric", "value")) diff --git a/test/benchmark/parameter_benchmarks/results/results.csv b/test/benchmark/parameter_benchmarks/results/results.csv new file mode 100644 index 00000000..df0ba2ae --- /dev/null +++ b/test/benchmark/parameter_benchmarks/results/results.csv @@ -0,0 +1,13 @@ +min_qual;hierarchical_clustering_cutoff;precision;recall;f1;precision_multimatch;recall_multimatch;f1_multimatch +1;10;0.3809226932668329;0.6674191669094087;0.48502328535139716;0.7191015917993167;0.6760850568016312;0.6969301792348791 +1;50;0.48785706643842947;0.6641421497232741;0.5625115647936841;0.67766076869621;0.6752840081561317;0.6764703007612634 +1;100;0.5523919378339;0.6626128750364113;0.6025029797377831;0.6632894497360917;0.6745557821147684;0.6688751776111002 +1;150;0.5783776900547561;0.6615205359743664;0.6171614919491814;0.6654783881851168;0.6737547334692688;0.6695909872768695 +5;10;0.89578828133045;0.6334838333818817;0.7421405110267456;0.9332441884385929;0.6439702883775124;0.7620796511691026 +5;50;0.9009064688916357;0.636906495776289;0.7462457337883959;0.9287992591830435;0.6479027090008739;0.7633294919214612 +5;100;0.9039810231023102;0.6382901252548792;0.7482499573160322;0.9241706161137441;0.6497960967084183;0.7630688173405814 +5;150;0.9080829015544042;0.6381444800466065;0.7495509366179112;0.9233478350942614;0.6495776288960093;0.7626376596958745 +10;10;0.943620557402199;0.5375036411302068;0.6848844761993133;0.9578490228637119;0.5452956597727935;0.6949576057696104 +10;50;0.9445687298036292;0.5534517914360617;0.697952061713656;0.9538002980625931;0.5618992135158754;0.7071845484392144 +10;100;0.9455081001472754;0.5610253422662395;0.7042047531992687;0.9504537650233015;0.5699825225750073;0.7126139240397029 +10;150;0.9451904296875;0.5638654238275561;0.7063492063492064;0.9487554904831625;0.5727497815321876;0.7142919711108467 diff --git a/test/benchmark/parameter_benchmarks/results/results.xlsm b/test/benchmark/parameter_benchmarks/results/results.xlsm new file mode 100644 index 00000000..82637b1e Binary files /dev/null and b/test/benchmark/parameter_benchmarks/results/results.xlsm differ diff --git a/test/benchmark/parameter_benchmarks/results/truvari_stats.txt b/test/benchmark/parameter_benchmarks/results/truvari_stats.txt new file mode 100644 index 00000000..5881b990 --- /dev/null +++ b/test/benchmark/parameter_benchmarks/results/truvari_stats.txt @@ -0,0 +1,89 @@ +2021-05-31 17:06:29,080 [INFO] Params: +{ + "base": "data/truth_set/HG002_SVs_Tier1_v0.6.vcf.gz", + "comp": "results/2021-05-31_output.vcf.gz", + "output": "results/2021-05-31_truvari_default", + "reference": null, + "giabreport": false, + "debug": false, + "prog": false, + "refdist": 500, + "pctsim": 0.0, + "buffer": 0.1, + "pctsize": 0.7, + "pctovl": 0.0, + "typeignore": false, + "gtcomp": false, + "bSample": null, + "cSample": null, + "sizemin": 50, + "sizefilt": 30, + "sizemax": 50000, + "passonly": false, + "no_ref": false, + "includebed": "data/truth_set/HG002_SVs_Tier1_v0.6.bed", + "multimatch": false +} +2021-05-31 17:06:54,947 [INFO] Stats: { + "TP-base": 9165, + "TP-call": 9165, + "FP": 14895, + "FN": 4567, + "precision": 0.3809226932668329, + "recall": 0.6674191669094087, + "f1": 0.48502328535139716, + "base cnt": 13732, + "call cnt": 24060, + "TP-call_TP-gt": 153, + "TP-call_FP-gt": 9012, + "TP-base_TP-gt": 153, + "TP-base_FP-gt": 9012, + "gt_precision": 0.006359102244389027, + "gt_recall": 0.03241525423728814, + "gt_f1": 0.01063238359972203 +} + +2021-05-31 17:08:44,586 [INFO] Params: +{ + "base": "data/truth_set/HG002_SVs_Tier1_v0.6.vcf.gz", + "comp": "results/2021-05-31_output.vcf.gz", + "output": "results/2021-05-31_truvari_multimatch", + "reference": null, + "giabreport": false, + "debug": false, + "prog": false, + "refdist": 500, + "pctsim": 0.0, + "buffer": 0.1, + "pctsize": 0.7, + "pctovl": 0.0, + "typeignore": false, + "gtcomp": false, + "bSample": null, + "cSample": null, + "sizemin": 50, + "sizefilt": 30, + "sizemax": 50000, + "passonly": false, + "no_ref": false, + "includebed": "data/truth_set/HG002_SVs_Tier1_v0.6.bed", + "multimatch": true +} +2021-05-31 17:09:10,260 [INFO] Stats: { + "TP-base": 9284, + "TP-call": 17257, + "FP": 6741, + "FN": 4448, + "precision": 0.7191015917993167, + "recall": 0.6760850568016312, + "f1": 0.6969301792348791, + "base cnt": 13732, + "call cnt": 16025, + "TP-call_TP-gt": 242, + "TP-call_FP-gt": 17015, + "TP-base_TP-gt": 222, + "TP-base_FP-gt": 9062, + "gt_precision": 0.010084173681140096, + "gt_recall": 0.04753747323340471, + "gt_f1": 0.016638751653840628 +} diff --git a/test/cli/iGenVar_cli_test.cpp b/test/cli/iGenVar_cli_test.cpp index 96bb923c..aabfddf3 100644 --- a/test/cli/iGenVar_cli_test.cpp +++ b/test/cli/iGenVar_cli_test.cpp @@ -105,7 +105,7 @@ std::string const help_page_advanced " Specify what should be the maximum length of your SVs to be\n" " detected. SVs larger than this threshold can still be output as\n" " translocations. This value needs to be non-negative. Default:\n" - " 1000000.\n" + " 100000.\n" " -m, --max_tol_inserted_length (signed 32 bit integer)\n" " Specify what should be the longest tolerated inserted sequence at\n" " sites of non-INS SVs. This value needs to be non-negative. Default:\n" @@ -119,7 +119,7 @@ std::string const help_page_advanced " needs to be non-negative. Default: 1.\n" " -w, --hierarchical_clustering_cutoff (double)\n" " Specify the distance cutoff for the hierarchical clustering. This\n" - " value needs to be non-negative. Default: 10.\n" + " value needs to be non-negative. Default: 100.\n" }; // std::string expected_res_default @@ -536,7 +536,8 @@ TEST_F(iGenVar_cli_test, dataset_single_end_mini_example) "-j", data("single_end_mini_example.sam"), "--verbose", "--method cigar_string --method split_read " - "--min_var_length 8 --max_var_length 400"); + "--min_var_length 8 --max_var_length 400 " + "--hierarchical_clustering_cutoff 20"); // Check the output of junctions: seqan3::debug_stream << "Check the output of junctions... " << '\n';