Skip to content

Commit

Permalink
Merge pull request #158 from Irallia/FEATURE/improve_performance_by_e…
Browse files Browse the repository at this point in the history
…ldariont

[FEATURE] improve performance by eldariont
  • Loading branch information
Irallia authored Sep 22, 2021
2 parents c6a600a + 2c6d828 commit f31f991
Show file tree
Hide file tree
Showing 20 changed files with 284 additions and 150 deletions.
Binary file modified doc/plots/CallerComparisonPlot.pdf
Binary file not shown.
Binary file modified doc/plots/hierarchical_clustering_cutoff.results.all.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified doc/plots/max_overlap.results.all.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/plots/max_tol_deleted_length.results.all.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified doc/plots/max_tol_inserted_length.results.all.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified doc/plots/max_var_length.results.all.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified doc/plots/min_var_length.results.all.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/plots/partition_max_distance.results.all.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 4 additions & 2 deletions include/iGenVar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ struct cmd_arguments
// SV specifications:
/* -k */ int32_t min_var_length = 30;
/* -l */ int32_t max_var_length = 100000;
/* -m */ int32_t max_tol_inserted_length = 5;
/* -m */ int32_t max_tol_inserted_length = 50;
/* -e */ int32_t max_tol_deleted_length = 50;
/* -n */ int32_t max_overlap = 10;
/* -q */ int32_t min_qual = 1;
// Clustering specifications:
/* -w */ double hierarchical_clustering_cutoff = 100;
/* -p */ int32_t partition_max_distance = 1000;
/* -w */ double hierarchical_clustering_cutoff = 0.5;
/* x? */
// Refinement specifications:
/* y, z? */
Expand Down
26 changes: 21 additions & 5 deletions include/modules/clustering/hierarchical_clustering_method.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,31 @@
* The returned partitions contain junctions meeting the following criteria:
* a) all junctions in a partition connect the same reference sequences,
* b) all junctions in a partition have the same orientations, and
* c) the distance between corresponding mates of two neighboring junctions is at most 50bp.
* c) the distance between corresponding mates of two neighboring junctions is
* smaller or equal than the given partition_max_distance.
* The junctions in each of the returned partitions are sorted even though
* the partitions themselves are not returned in a particular order.
*
* \param[in] junctions - a vector of junctions (needs to be sorted)
* \param[in] partition_max_distance - maximum distance between junctions in the same partition
*
* \returns Returns unordered partitions of sorted junctions.
*/
std::vector<std::vector<Junction>> partition_junctions(std::vector<Junction> const & junctions);
std::vector<std::vector<Junction>> partition_junctions(std::vector<Junction> const & junctions,
int32_t const partition_max_distance);

/*! \brief Sub-partition an existing partition based on the second mate of each junction.
* The junctions in each of the returned sub-partitions are sorted even though
* the sub-partitions themselves are not returned in a particular order.
*
* \param[in] partition - a partition (i.e. a vector) of junctions (needs to be sorted
* by their second mates)
* \param[in] partition_max_distance - maximum distance between junctions in the same partition
*
* \returns Returns unordered sub-partitions of sorted junctions based on the second mate of each junction.
*/
std::vector<std::vector<Junction>> split_partition_based_on_mate2(std::vector<Junction> const & partition);
std::vector<std::vector<Junction>> split_partition_based_on_mate2(std::vector<Junction> const & partition,
int32_t const partition_max_distance);

/*! \brief Compute the distance between two junctions.
* For two junctions that connect the same reference sequences and have the same
Expand All @@ -31,16 +40,23 @@ std::vector<std::vector<Junction>> split_partition_based_on_mate2(std::vector<Ju
*
* \param[in] lhs - left side junction
* \param[in] rhs - right side junction
*
* \returns Returns distance between two junctions.
*/
int junction_distance(Junction const & lhs, Junction const & rhs);
double junction_distance(Junction const & lhs, Junction const & rhs);

/*! \brief Cluster junctions by an hierarchical clustering method.
* The returned clusters and the junctions in each returned cluster are sorted.
*
* \param[in] junctions - a vector of junctions (needs to be sorted)
* \param[in] partition_max_distance - maximum distance between junctions in the same partition
* \param[in] clustering_cutoff - distance cutoff for clustering
*
* \returns Returns sorted clusters of sorted junctions.
*
* \details For the algorithms we use the library hclust.
* \see https://lionel.kr.hs-niederrhein.de/~dalitz/data/hclust/ (last access 01.06.2021).
*/
std::vector<Cluster> hierarchical_clustering_method(std::vector<Junction> const & junctions, double clustering_cutoff);
std::vector<Cluster> hierarchical_clustering_method(std::vector<Junction> const & junctions,
int32_t const partition_max_distance,
double clustering_cutoff);
12 changes: 11 additions & 1 deletion src/iGenVar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments
"Specify what should be the longest tolerated inserted sequence at sites of non-INS SVs. "
"This value needs to be non-negative.",
seqan3::option_spec::advanced);
parser.add_option(args.max_tol_deleted_length, 'e', "max_tol_deleted_length",
"Specify what should be the longest tolerated deleted sequence at sites of non-DEL SVs. "
"This value needs to be non-negative.",
seqan3::option_spec::advanced);
parser.add_option(args.max_overlap, 'n', "max_overlap",
"Specify the maximum allowed overlap between two alignment segments. "
"This value needs to be non-negative.",
Expand All @@ -104,6 +108,10 @@ void initialize_argument_parser(seqan3::argument_parser & parser, cmd_arguments
seqan3::option_spec::advanced);

// Options - Clustering specifications:
parser.add_option(args.partition_max_distance, 'p', "partition_max_distance",
"Specify the maximum distance in bp between members of the same partition."
"This value needs to be non-negative.",
seqan3::option_spec::advanced);
parser.add_option(args.hierarchical_clustering_cutoff, 'w', "hierarchical_clustering_cutoff",
"Specify the distance cutoff for the hierarchical clustering. "
"This value needs to be non-negative.",
Expand Down Expand Up @@ -155,7 +163,9 @@ void detect_variants_in_alignment_file(cmd_arguments const & args)
clusters = simple_clustering_method(junctions);
break;
case 1: // hierarchical clustering
clusters = hierarchical_clustering_method(junctions, args.hierarchical_clustering_cutoff);
clusters = hierarchical_clustering_method(junctions,
args.partition_max_distance,
args.hierarchical_clustering_cutoff);
break;
case 2: // self-balancing_binary_tree,
seqan3::debug_stream << "The self-balancing binary tree clustering method is not yet implemented\n";
Expand Down
71 changes: 53 additions & 18 deletions src/modules/clustering/hierarchical_clustering_method.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

#include "fastcluster.h" // for hclust_fast

std::vector<std::vector<Junction>> partition_junctions(std::vector<Junction> const & junctions)
std::vector<std::vector<Junction>> partition_junctions(std::vector<Junction> const & junctions,
int32_t const partition_max_distance)
{
// Partition based on mate 1
std::vector<Junction> current_partition{};
Expand All @@ -25,13 +26,14 @@ std::vector<std::vector<Junction>> partition_junctions(std::vector<Junction> con
{
if (junction.get_mate1().seq_name != current_partition.back().get_mate1().seq_name ||
junction.get_mate1().orientation != current_partition.back().get_mate1().orientation ||
abs(junction.get_mate1().position - current_partition.back().get_mate1().position) > 50)
std::abs(junction.get_mate1().position - current_partition.back().get_mate1().position)
> partition_max_distance)
{
// Partition based on mate 2
std::sort(current_partition.begin(), current_partition.end(), [](Junction const & a, Junction const & b) {
return a.get_mate2() < b.get_mate2();
});
current_partition_splitted = split_partition_based_on_mate2(current_partition);
current_partition_splitted = split_partition_based_on_mate2(current_partition, partition_max_distance);
for (std::vector<Junction> partition : current_partition_splitted)
{
final_partitions.push_back(partition);
Expand All @@ -46,7 +48,7 @@ std::vector<std::vector<Junction>> partition_junctions(std::vector<Junction> con
std::sort(current_partition.begin(), current_partition.end(), [](Junction a, Junction b) {
return a.get_mate2() < b.get_mate2();
});
current_partition_splitted = split_partition_based_on_mate2(current_partition);
current_partition_splitted = split_partition_based_on_mate2(current_partition, partition_max_distance);
for (std::vector<Junction> partition : current_partition_splitted)
{
final_partitions.push_back(partition);
Expand All @@ -55,7 +57,8 @@ std::vector<std::vector<Junction>> partition_junctions(std::vector<Junction> con
return final_partitions;
}

std::vector<std::vector<Junction>> split_partition_based_on_mate2(std::vector<Junction> const & partition)
std::vector<std::vector<Junction>> split_partition_based_on_mate2(std::vector<Junction> const & partition,
int32_t const partition_max_distance)
{
std::vector<Junction> current_partition{};
std::vector<std::vector<Junction>> splitted_partition{};
Expand All @@ -70,7 +73,8 @@ std::vector<std::vector<Junction>> split_partition_based_on_mate2(std::vector<Ju
{
if (junction.get_mate2().seq_name != current_partition.back().get_mate2().seq_name ||
junction.get_mate2().orientation != current_partition.back().get_mate2().orientation ||
abs(junction.get_mate2().position - current_partition.back().get_mate2().position) > 50)
std::abs(junction.get_mate2().position - current_partition.back().get_mate2().position)
> partition_max_distance)
{
std::sort(current_partition.begin(), current_partition.end());
splitted_partition.push_back(current_partition);
Expand All @@ -87,25 +91,54 @@ std::vector<std::vector<Junction>> split_partition_based_on_mate2(std::vector<Ju
return splitted_partition;
}

int junction_distance(Junction const & lhs, Junction const & rhs)
double junction_distance(Junction const & lhs, Junction const & rhs)
{
// lhs and rhs connect the same chromosomes with the same orientations
if ((lhs.get_mate1().seq_name == rhs.get_mate1().seq_name) &&
(lhs.get_mate1().orientation == rhs.get_mate1().orientation) &&
(lhs.get_mate2().seq_name == rhs.get_mate2().seq_name) &&
(lhs.get_mate2().orientation == rhs.get_mate2().orientation))
{
// Reference: ................
// Junction 1 with mates A and B: A------->B (2bp inserted)
// Junction 2 with mates C and D: C------>D (5bp inserted)
// Distance = 1 (distance A-C) + 2 (distance B-D) + 0 (no tandem duplication) + 3 (absolute insertion size difference)
return (std::abs(lhs.get_mate1().position - rhs.get_mate1().position) +
std::abs(lhs.get_mate2().position - rhs.get_mate2().position) +
std::abs((int)(lhs.get_tandem_dup_count() - rhs.get_tandem_dup_count())) +
std::abs((int)(lhs.get_inserted_sequence().size() - rhs.get_inserted_sequence().size())));
// lhs and rhs are intra-chromosomal adjacencies
if (lhs.get_mate1().seq_name == lhs.get_mate2().seq_name)
{
// the directed size is the directed distance between both mates
// the directed size is positive for insertions and negative for deletions
int32_t lhs_directed_size = lhs.get_inserted_sequence().size() +
lhs.get_mate1().position -
lhs.get_mate2().position;
int32_t rhs_directed_size = rhs.get_inserted_sequence().size() +
rhs.get_mate1().position -
rhs.get_mate2().position;
// lhs and rhs have the same type (either deletion/inversion or insertion)
if ((lhs_directed_size < 0 && rhs_directed_size < 0) ||
(lhs_directed_size > 0 && rhs_directed_size > 0))
{
double position_distance = std::abs(lhs.get_mate1().position - rhs.get_mate1().position) / 1000.0;
// TODO (irallia 01.09.2021): std::abs((int)(lhs.get_tandem_dup_count() - rhs.get_tandem_dup_count()))
double size_distance = ((double)(std::max(std::abs(lhs_directed_size), std::abs(rhs_directed_size))) /
(double)(std::min(std::abs(lhs_directed_size), std::abs(rhs_directed_size)))) - 1.0;
return position_distance + size_distance;
}
// lhs and rhs have different types
else
{
return std::numeric_limits<double>::max();
}
}
// lhs and rhs are inter-chromosomal adjacencies
else
{
double position_distance1 = std::abs(lhs.get_mate1().position - rhs.get_mate1().position) / 1000.0;
double position_distance2 = std::abs(lhs.get_mate2().position - rhs.get_mate2().position) / 1000.0;
// TODO (irallia 01.09.2021): std::abs((int)(lhs.get_tandem_dup_count() - rhs.get_tandem_dup_count()))
double size_distance = std::abs((double)(lhs.get_inserted_sequence().size() - rhs.get_inserted_sequence().size())) / 1000.0;
return position_distance1 + position_distance2 + size_distance;
}
}
else
{
return std::numeric_limits<int>::max();
return std::numeric_limits<double>::max();
}
}

Expand All @@ -118,9 +151,11 @@ inline std::vector<Junction> subsample_partition(std::vector<Junction> const & p
return subsample;
}

std::vector<Cluster> hierarchical_clustering_method(std::vector<Junction> const & junctions, double clustering_cutoff)
std::vector<Cluster> hierarchical_clustering_method(std::vector<Junction> const & junctions,
int32_t const partition_max_distance,
double clustering_cutoff)
{
auto partitions = partition_junctions(junctions);
auto partitions = partition_junctions(junctions, partition_max_distance);
std::vector<Cluster> clusters{};
// Set the maximum partition size that is still feasible to cluster in reasonable time
// A trade-off between reducing runtime and keeping as many junctions as possible has to be made
Expand Down
19 changes: 12 additions & 7 deletions src/variant_detection/variant_detection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,13 @@ void detect_junctions_in_short_reads_sam_file(std::vector<Junction> & junctions,
break;
}
}

num_good++;
if (num_good % 1000 == 0)
if (gVerbose)
{
seqan3::debug_stream << num_good << " good alignments from short read file." << std::endl;
++num_good;
if (num_good % 100000 == 0)
{
seqan3::debug_stream << num_good << " good alignments from short read file." << std::endl;
}
}
}
}
Expand Down Expand Up @@ -174,10 +176,13 @@ void detect_junctions_in_long_reads_sam_file(std::vector<Junction> & junctions,
}
}

num_good++;
if (num_good % 1000 == 0)
if (gVerbose)
{
seqan3::debug_stream << num_good << " good alignments from long read file." << std::endl;
++num_good;
if (num_good % 100000 == 0)
{
seqan3::debug_stream << num_good << " good alignments from long read file." << std::endl;
}
}
}
}
20 changes: 11 additions & 9 deletions src/variant_detection/variant_output.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ void find_and_output_variants(std::map<std::string, int32_t> & references_length
size_t insert_size = clusters[i].get_average_inserted_sequence_size();
if (mate1.orientation == strand::forward)
{
int32_t distance = mate2_pos - mate1_pos;
//Deletion
if (distance >= args.min_var_length &&
distance <= args.max_var_length &&
int32_t distance = mate2_pos - mate1_pos - 1;
int32_t sv_length = insert_size - distance;
// Deletion (sv_length is negative)
if (-sv_length >= args.min_var_length &&
-sv_length <= args.max_var_length &&
insert_size <= args.max_tol_inserted_length)
{
variant_record tmp{};
Expand All @@ -45,15 +46,16 @@ void find_and_output_variants(std::map<std::string, int32_t> & references_length
tmp.add_info("SVTYPE", "DEL");
// Increment position by 1 because VCF is 1-based
tmp.set_pos(mate1_pos + 1);
tmp.add_info("SVLEN", std::to_string(-distance + 1));
tmp.add_info("SVLEN", std::to_string(sv_length));
// Increment end by 1 because VCF is 1-based
// Decrement end by 1 because deletion ends one base before mate2 begins
tmp.add_info("END", std::to_string(mate2_pos));
tmp.print(out_stream);
}
//Insertion
else if (distance == 1 &&
insert_size >= args.min_var_length)
// Insertion (sv_length is positive)
else if (sv_length >= args.min_var_length &&
sv_length <= args.max_var_length &&
distance <= args.max_tol_deleted_length)
{
variant_record tmp{};
tmp.set_chrom(mate1.seq_name);
Expand All @@ -62,7 +64,7 @@ void find_and_output_variants(std::map<std::string, int32_t> & references_length
tmp.add_info("SVTYPE", "INS");
// Increment position by 1 because VCF is 1-based
tmp.set_pos(mate1_pos + 1);
tmp.add_info("SVLEN", std::to_string(insert_size));
tmp.add_info("SVLEN", std::to_string(sv_length));
// Increment end by 1 because VCF is 1-based
tmp.add_info("END", std::to_string(mate1_pos + 1));
tmp.print(out_stream);
Expand Down
Loading

0 comments on commit f31f991

Please sign in to comment.