Skip to content

Commit

Permalink
Add --anchor-samples option to add specific sequences to subtrees
Browse files Browse the repository at this point in the history
Add option --anchor-samples to usher-sampled and usher-sampled-server, and corresponding option --usher-anchor-samples to matUtils, to specify a file containing sample names that should be added to subtrees after the usual random selection.  This was a feature request from influenza collaborators, so that vaccine strains could be added to subtrees to give the larger-scale context of familiar sequences.
  • Loading branch information
AngieHinrichs committed May 21, 2024
1 parent 5f417da commit e541c00
Show file tree
Hide file tree
Showing 9 changed files with 96 additions and 36 deletions.
25 changes: 18 additions & 7 deletions src/matOptimize/mutation_annotated_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ void Mutation_Annotated_Tree::Tree::rotate_for_display(bool reverse) {
void Mutation_Annotated_Tree::get_random_single_subtree(
const Mutation_Annotated_Tree::Tree & T, std::vector<Node *> samples,
std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx,
bool retain_original_branch_len) {
bool retain_original_branch_len, std::vector<Node*> anchor_samples) {
// timer.Start();
std::string preid = "/";
if (use_tree_idx) {
Expand All @@ -436,6 +436,9 @@ void Mutation_Annotated_Tree::get_random_single_subtree(
}
}

// Add "anchor samples" (if any)
leaves_to_keep_set.insert(anchor_samples.begin(), anchor_samples.end());

std::vector<Node *> leaves_to_keep(leaves_to_keep_set.begin(), leaves_to_keep_set.end());

auto new_T = get_subtree(T, leaves_to_keep);
Expand All @@ -445,10 +448,15 @@ void Mutation_Annotated_Tree::get_random_single_subtree(

// Write subtree to file
auto subtree_filename = outdir + preid + "single-subtree.nh";
fprintf(
stderr,
"Writing single subtree with %zu randomly added leaves to file %s.\n",
subtree_size, subtree_filename.c_str());
if (anchor_samples.size() > 0) {
fprintf(stderr,
"Writing single subtree with %zu randomly added leaves and %zu anchor samples to file %s.\n",
subtree_size, anchor_samples.size(), subtree_filename.c_str());
} else {
fprintf(stderr,
"Writing single subtree with %zu randomly added leaves to file %s.\n",
subtree_size, subtree_filename.c_str());
}

std::ofstream subtree_file(subtree_filename.c_str(), std::ofstream::out);
std::stringstream newick_ss;
Expand Down Expand Up @@ -580,7 +588,7 @@ static int set_num_leaves_as_bfs_idx(Node* root){
root->bfs_index=child_cnt;
return child_cnt;
}
void Mutation_Annotated_Tree::get_random_sample_subtrees (const Mutation_Annotated_Tree::Tree& T, std::vector<Node*> samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len) {
void Mutation_Annotated_Tree::get_random_sample_subtrees (const Mutation_Annotated_Tree::Tree& T, std::vector<Node*> samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len, std::vector<Node*> anchor_samples) {
fprintf(stderr, "Computing subtrees for %ld samples. \n\n", samples.size());
std::string preid = "/";
if (use_tree_idx) {
Expand Down Expand Up @@ -660,7 +668,7 @@ void Mutation_Annotated_Tree::get_random_sample_subtrees (const Mutation_Annotat
Mutation_Annotated_Tree::Node* last_anc = samples[i];
trees_to_extract.emplace_back();
auto& curr_tree=trees_to_extract.back();
curr_tree.leaves_to_keep.reserve(subtree_size);
curr_tree.leaves_to_keep.reserve(subtree_size + anchor_samples.size());
subtree_size=std::min(subtree_size,T.root->bfs_index-1);
// Keep moving up the tree till a subtree of required size is
// found
Expand Down Expand Up @@ -706,6 +714,9 @@ void Mutation_Annotated_Tree::get_random_sample_subtrees (const Mutation_Annotat
}
break;
}
// Add "anchor samples" (if any)
curr_tree.leaves_to_keep.insert(curr_tree.leaves_to_keep.end(),
anchor_samples.begin(), anchor_samples.end());
}
tbb::parallel_for(tbb::blocked_range<size_t>(0,trees_to_extract.size()),[&](tbb::blocked_range<size_t> range){
for (int num_subtrees=range.begin(); num_subtrees<(int)range.end(); num_subtrees++) {
Expand Down
8 changes: 5 additions & 3 deletions src/matOptimize/mutation_annotated_tree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ class Mutations_Collection {
}
void push_back(const Mutation& m) {
if (m.get_position()>=(int)(Mutation::refs.size()+1)&&m.get_position()!=INT_MAX) {
fprintf(stderr, "strange size \n");
fprintf(stderr, "strange size %d >= %d\n", m.get_position(), (int)(Mutation::refs.size()+1));
raise(SIGTRAP);
}
if (!mutations.empty()) {
Expand Down Expand Up @@ -658,12 +658,14 @@ void get_random_single_subtree(const Mutation_Annotated_Tree::Tree &T,
std::vector<Node*> samples,
std::string outdir, size_t subtree_size,
size_t tree_idx = 0, bool use_tree_idx = false,
bool retain_original_branch_len = false);
bool retain_original_branch_len = false,
std::vector<Node*> anchor_samples = std::vector<Node*>());
void get_random_sample_subtrees(const Mutation_Annotated_Tree::Tree &T,
std::vector<Node*> samples,
std::string outdir, size_t subtree_size,
size_t tree_idx = 0, bool use_tree_idx = false,
bool retain_original_branch_len = false);
bool retain_original_branch_len = false,
std::vector<Node*> anchor_samples = std::vector<Node*>());
bool load_mutation_annotated_tree (std::string filename,Tree& tree);
void save_mutation_annotated_tree (const Tree& tree, std::string filename);
void get_sample_mutation_paths (Mutation_Annotated_Tree::Tree* T, std::vector<Node*> samples, std::string mutation_paths_filename);
Expand Down
20 changes: 18 additions & 2 deletions src/matUtils/extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ po::variables_map parse_extract_command(po::parsed_options parsed) {
"Use to produce an usher-style minimum set of subtrees of the indicated size which include all of the selected samples. Produces .nh and .txt files.")
("usher-clades-txt", po::bool_switch(),
"When producing usher-style subtree(s), also write an usher-style clades.txt file with clade annotations for selected samples, if the tree has clade annotations.")
("usher-anchor-samples", po::value<std::string>()->default_value(""),
"Add samples from file to usher-style subtree(s) (e.g. to provide larger-scale context by including well-known vaccine strains)")
("add-random,W", po::value<size_t>()->default_value(0),
"Add exactly W samples at random to your selection. Affected by -Z and overridden by -z.")
("select-nearest,Y", po::value<size_t>()->default_value(0),
Expand Down Expand Up @@ -168,6 +170,7 @@ void extract_main (po::parsed_options parsed) {
size_t usher_single_subtree_size = vm["usher-single-subtree-size"].as<size_t>();
size_t usher_minimum_subtrees_size = vm["usher-minimum-subtrees-size"].as<size_t>();
bool usher_clades_txt = vm["usher-clades-txt"].as<bool>();
std::string usher_anchor_samples = vm["usher-anchor-samples"].as<std::string>();
size_t setsize = vm["set-size"].as<size_t>();
size_t minimum_subtrees_size = vm["minimum-subtrees-size"].as<size_t>();
bool limit_lca = vm["limit-to-lca"].as<bool>();
Expand Down Expand Up @@ -494,11 +497,24 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) {
//if usher-style subtree output is requested,
//produce that.

std::vector<std::string> anchor_samples;
if (usher_anchor_samples != "") {
if (usher_minimum_subtrees_size > 0 || usher_single_subtree_size) {
anchor_samples = read_sample_names(usher_anchor_samples);
if (anchor_samples.size() == 0) {
fprintf(stderr, "ERROR: --usher-anchor-samples file is empty or unparseable!");
exit(1);
}
} else {
fprintf(stderr, "ERROR: --usher-anchor-samples may be used only together with --usher-minimum-subtrees-size/-x and/or --usher-single-subtree-size/-X\n");
exit(1);
}
}
if (usher_minimum_subtrees_size > 0) {
timer.Start();
fprintf(stderr, "Random minimum sample subtrees of size %ld requested.\n", usher_minimum_subtrees_size);
if (samples.size() > 0) {
MAT::get_random_sample_subtrees(&T, samples, dir_prefix, usher_minimum_subtrees_size, 0, false, retain_branch);
MAT::get_random_sample_subtrees(&T, samples, dir_prefix, usher_minimum_subtrees_size, 0, false, retain_branch, anchor_samples);
} else {
fprintf(stderr, "ERROR: Minimum sample subtree output requested with no valid samples! Check selection parameters\n");
exit(1);
Expand All @@ -509,7 +525,7 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) {
timer.Start();
fprintf(stderr, "Random single encompassing subtree of size %ld requested.\n", usher_single_subtree_size);
if (samples.size() > 0) {
MAT::get_random_single_subtree(&T, samples, dir_prefix, usher_single_subtree_size, 0, false, retain_branch);
MAT::get_random_single_subtree(&T, samples, dir_prefix, usher_single_subtree_size, 0, false, retain_branch, anchor_samples);
} else {
fprintf(stderr, "ERROR: Encompassing subtree output requested with no valid samples! Check selection parameters\n");
exit(1);
Expand Down
17 changes: 13 additions & 4 deletions src/mutation_annotated_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1690,7 +1690,7 @@ void Mutation_Annotated_Tree::clear_tree(Mutation_Annotated_Tree::Tree& T) {
}
}

void Mutation_Annotated_Tree::get_random_single_subtree (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len) {
void Mutation_Annotated_Tree::get_random_single_subtree (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len, std::vector<std::string> anchor_samples) {
//timer.Start();
std::string preid = "/";
if (use_tree_idx) {
Expand All @@ -1716,15 +1716,21 @@ void Mutation_Annotated_Tree::get_random_single_subtree (Mutation_Annotated_Tree
leaves_to_keep.emplace_back(l->identifier);
}

// Add "anchor samples" (if any)
leaves_to_keep.insert(leaves_to_keep.end(), anchor_samples.begin(), anchor_samples.end());

auto new_T = Mutation_Annotated_Tree::get_subtree(*T, leaves_to_keep);

// Rotate tree for display
new_T.rotate_for_display();

// Write subtree to file
auto subtree_filename = outdir + preid + "single-subtree.nh";
fprintf(stderr, "Writing single subtree with %zu randomly added leaves to file %s.\n", subtree_size, subtree_filename.c_str());

if (anchor_samples.size() > 0) {
fprintf(stderr, "Writing single subtree with %zu randomly added leaves and %zu anchor samples to file %s.\n", subtree_size, anchor_samples.size(), subtree_filename.c_str());
} else {
fprintf(stderr, "Writing single subtree with %zu randomly added leaves to file %s.\n", subtree_size, subtree_filename.c_str());
}
std::ofstream subtree_file(subtree_filename.c_str(), std::ofstream::out);
std::stringstream newick_ss;
write_newick_string(newick_ss, new_T, new_T.root, true, true, retain_original_branch_len);
Expand Down Expand Up @@ -1776,7 +1782,7 @@ void Mutation_Annotated_Tree::get_random_single_subtree (Mutation_Annotated_Tree
}
}

void Mutation_Annotated_Tree::get_random_sample_subtrees (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len) {
void Mutation_Annotated_Tree::get_random_sample_subtrees (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len, std::vector<std::string> anchor_samples) {
fprintf(stderr, "Computing subtrees for %ld samples. \n\n", samples.size());
std::string preid = "/";
if (use_tree_idx) {
Expand Down Expand Up @@ -1903,6 +1909,9 @@ void Mutation_Annotated_Tree::get_random_sample_subtrees (Mutation_Annotated_Tre
}
}

// Add "anchor samples" (if any)
leaves_to_keep.insert(leaves_to_keep.end(), anchor_samples.begin(), anchor_samples.end());

auto new_T = Mutation_Annotated_Tree::get_subtree(*T, leaves_to_keep);

// Rotate tree for display
Expand Down
4 changes: 2 additions & 2 deletions src/mutation_annotated_tree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,8 @@ Tree get_tree_copy(const Tree& tree, const std::string& identifier="");

Node* LCA (const Tree& tree, const std::string& node_id1, const std::string& node_id2);
Tree get_subtree (const Tree& tree, const std::vector<std::string>& samples, bool keep_clade_annotations=false);
void get_random_single_subtree (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, bool retain_original_branch_len = false);
void get_random_sample_subtrees (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, bool retain_original_branch_len = false);
void get_random_single_subtree (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, bool retain_original_branch_len = false, std::vector<std::string> anchor_samples = std::vector<std::string>());
void get_random_sample_subtrees (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, bool retain_original_branch_len = false, std::vector<std::string> anchor_samples = std::vector<std::string>());
void get_sample_mutation_paths (Mutation_Annotated_Tree::Tree* T, std::vector<std::string> samples, std::string mutation_paths_filename);
void clear_tree(Tree& tree);

Expand Down
2 changes: 2 additions & 0 deletions src/usher-sampled/driver/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,8 @@ int main(int argc, char **argv) {
"Write minimum set of subtrees covering the newly added samples of size equal to this value")
("write-single-subtree,K", po::value<size_t>(&options.out_options.print_subtrees_single)->default_value(0), \
"Similar to write-subtrees-size but produces a single subtree with all newly added samples along with random samples up to the value specified by this argument")
("anchor-samples", po::value<std::string>(&options.out_options.anchor_samples_file),
"Add samples from file to subtree(s) generated by --write-subtrees-size and/or --write-single-subtree (e.g. to provide larger-scale context by including well-known vaccine strains)")
("multiple-placements,M", po::value(&options.keep_n_tree)->default_value(1), \
"Create a new tree up to this limit for each possibility of parsimony-optimal placement")
("retain-input-branch-lengths,l", po::bool_switch(&options.out_options.retain_original_branch_len)->default_value(false), \
Expand Down
23 changes: 8 additions & 15 deletions src/usher-sampled/driver/socket.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,8 @@ std::string get_options(FILE *f, Leader_Thread_Options &options,std::string& ext
"Input VCF file (in uncompressed or gzip-compressed .gz format)")
("existing_samples", po::value<std::string>(&extract_from_existing),
"extract from existing samples")
("anchor_samples", po::value<std::string>(&options.out_options.anchor_samples_file),
"add samples from file to generated subtree(s)")
(
"outdir,d",
po::value<std::string>(&options.out_options.outdir)->default_value("."),
Expand Down Expand Up @@ -400,19 +402,10 @@ static void child_proc(int fd, TreeCollectionPtr &trees_ptr) {
tbb::task_scheduler_init init(num_threads);
if (existing_samples != "") {
MAT::Tree &tree = iter->second->expanded_tree;
std::fstream sample_file(existing_samples,std::ios_base::in);
std::string sample_name;
std::vector<MAT::Node *> nodes_to_extract;
while (sample_file) {
std::getline(sample_file, sample_name);
if (sample_name != "") {
auto node = tree.get_node(sample_name);
if (!node) {
fprintf(f, "node %s do not exist\n", sample_name.c_str());
} else {
nodes_to_extract.push_back(node);
}
}
std::vector<MAT::Node *> nodes_to_extract = read_sample_nodes(existing_samples, tree, f);
std::vector<MAT::Node *> anchor_sample_nodes;
if (options.out_options.anchor_samples_file != "") {
anchor_sample_nodes = read_sample_nodes(options.out_options.anchor_samples_file, tree, f);
}
if (options.out_options.detailed_clades) {
int num_annotation=tree.get_num_annotations();
Expand Down Expand Up @@ -441,14 +434,14 @@ static void child_proc(int fd, TreeCollectionPtr &trees_ptr) {
options.out_options.print_subtrees_single);
MAT::get_random_single_subtree(
tree, nodes_to_extract, options.out_options.outdir, options.out_options.print_subtrees_single,
0, false, options.out_options.retain_original_branch_len);
0, false, options.out_options.retain_original_branch_len, anchor_sample_nodes);
}
// check_leaves(T);
if ((options.out_options.print_subtrees_size > 1)) {
fprintf(stderr, "Computing subtrees for added samples. \n\n");
MAT::get_random_sample_subtrees(
tree, nodes_to_extract, options.out_options.outdir, options.out_options.print_subtrees_size, 0,
false, options.out_options.retain_original_branch_len);
false, options.out_options.retain_original_branch_len, anchor_sample_nodes);
}

} else {
Expand Down
6 changes: 5 additions & 1 deletion src/usher-sampled/usher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ void Sample_Input(const char *name, std::vector<Sample_Muts> &sample_mutations,
Mutation_Set get_mutations(const MAT::Node *main_tree_node);
void check_descendant_nuc(const MAT::Node* node);
#endif

std::vector<MAT::Node*>read_sample_nodes(std::string samples_file, MAT::Tree &T, FILE *f);

struct output_options {
bool print_uncondensed_tree;
std::string outdir;
Expand All @@ -92,6 +95,7 @@ struct output_options {
bool detailed_clades;
bool redo_FS_Min_Back_Mutations;
bool suppress_whole_newick;
std::string anchor_samples_file;
};
bool final_output(MAT::Tree& T,const output_options& options,int t_idx,std::vector<Clade_info>& assigned_clades,
size_t sample_start_idx,size_t sample_end_idx,std::vector<std::string>& low_confidence_samples,
Expand Down Expand Up @@ -174,4 +178,4 @@ int prep_tree(MAT::Tree &tree);
void load_diff_for_usher(
const char *input_path,std::vector<Sample_Muts>& all_samples,
std::vector<mutated_t>& position_wise_out, MAT::Tree &tree,
const std::string& fasta_fname,std::vector<std::string> & samples);
const std::string& fasta_fname,std::vector<std::string> & samples);
Loading

0 comments on commit e541c00

Please sign in to comment.