From e541c00401738b89e01cc681422fed776b9f7f53 Mon Sep 17 00:00:00 2001 From: Angie Hinrichs Date: Tue, 21 May 2024 16:35:28 -0700 Subject: [PATCH] Add --anchor-samples option to add specific sequences to subtrees Add option --anchor-samples to usher-sampled and usher-sampled-server, and corresponding option --usher-anchor-samples to matUtils, to specify a file containing sample names that should be added to subtrees after the usual random selection. This was a feature request from influenza collaborators, so that vaccine strains could be added to subtrees to give the larger-scale context of familiar sequences. --- src/matOptimize/mutation_annotated_tree.cpp | 25 +++++++++++++------ src/matOptimize/mutation_annotated_tree.hpp | 8 +++--- src/matUtils/extract.cpp | 20 +++++++++++++-- src/mutation_annotated_tree.cpp | 17 ++++++++++--- src/mutation_annotated_tree.hpp | 4 +-- src/usher-sampled/driver/main.cpp | 2 ++ src/usher-sampled/driver/socket.cpp | 23 ++++++------------ src/usher-sampled/usher.hpp | 6 ++++- src/usher-sampled/utils.cpp | 27 +++++++++++++++++++-- 9 files changed, 96 insertions(+), 36 deletions(-) diff --git a/src/matOptimize/mutation_annotated_tree.cpp b/src/matOptimize/mutation_annotated_tree.cpp index 0f20e189..33d47d1b 100644 --- a/src/matOptimize/mutation_annotated_tree.cpp +++ b/src/matOptimize/mutation_annotated_tree.cpp @@ -418,7 +418,7 @@ void Mutation_Annotated_Tree::Tree::rotate_for_display(bool reverse) { void Mutation_Annotated_Tree::get_random_single_subtree( const Mutation_Annotated_Tree::Tree & T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, - bool retain_original_branch_len) { + bool retain_original_branch_len, std::vector anchor_samples) { // timer.Start(); std::string preid = "/"; if (use_tree_idx) { @@ -436,6 +436,9 @@ void Mutation_Annotated_Tree::get_random_single_subtree( } } + // Add "anchor samples" (if any) + leaves_to_keep_set.insert(anchor_samples.begin(), anchor_samples.end()); + std::vector leaves_to_keep(leaves_to_keep_set.begin(), leaves_to_keep_set.end()); auto new_T = get_subtree(T, leaves_to_keep); @@ -445,10 +448,15 @@ void Mutation_Annotated_Tree::get_random_single_subtree( // Write subtree to file auto subtree_filename = outdir + preid + "single-subtree.nh"; - fprintf( - stderr, - "Writing single subtree with %zu randomly added leaves to file %s.\n", - subtree_size, subtree_filename.c_str()); + if (anchor_samples.size() > 0) { + fprintf(stderr, + "Writing single subtree with %zu randomly added leaves and %zu anchor samples to file %s.\n", + subtree_size, anchor_samples.size(), subtree_filename.c_str()); + } else { + fprintf(stderr, + "Writing single subtree with %zu randomly added leaves to file %s.\n", + subtree_size, subtree_filename.c_str()); + } std::ofstream subtree_file(subtree_filename.c_str(), std::ofstream::out); std::stringstream newick_ss; @@ -580,7 +588,7 @@ static int set_num_leaves_as_bfs_idx(Node* root){ root->bfs_index=child_cnt; return child_cnt; } -void Mutation_Annotated_Tree::get_random_sample_subtrees (const Mutation_Annotated_Tree::Tree& T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len) { +void Mutation_Annotated_Tree::get_random_sample_subtrees (const Mutation_Annotated_Tree::Tree& T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len, std::vector anchor_samples) { fprintf(stderr, "Computing subtrees for %ld samples. \n\n", samples.size()); std::string preid = "/"; if (use_tree_idx) { @@ -660,7 +668,7 @@ void Mutation_Annotated_Tree::get_random_sample_subtrees (const Mutation_Annotat Mutation_Annotated_Tree::Node* last_anc = samples[i]; trees_to_extract.emplace_back(); auto& curr_tree=trees_to_extract.back(); - curr_tree.leaves_to_keep.reserve(subtree_size); + curr_tree.leaves_to_keep.reserve(subtree_size + anchor_samples.size()); subtree_size=std::min(subtree_size,T.root->bfs_index-1); // Keep moving up the tree till a subtree of required size is // found @@ -706,6 +714,9 @@ void Mutation_Annotated_Tree::get_random_sample_subtrees (const Mutation_Annotat } break; } + // Add "anchor samples" (if any) + curr_tree.leaves_to_keep.insert(curr_tree.leaves_to_keep.end(), + anchor_samples.begin(), anchor_samples.end()); } tbb::parallel_for(tbb::blocked_range(0,trees_to_extract.size()),[&](tbb::blocked_range range){ for (int num_subtrees=range.begin(); num_subtrees<(int)range.end(); num_subtrees++) { diff --git a/src/matOptimize/mutation_annotated_tree.hpp b/src/matOptimize/mutation_annotated_tree.hpp index 7d7c127a..2c302ec8 100644 --- a/src/matOptimize/mutation_annotated_tree.hpp +++ b/src/matOptimize/mutation_annotated_tree.hpp @@ -302,7 +302,7 @@ class Mutations_Collection { } void push_back(const Mutation& m) { if (m.get_position()>=(int)(Mutation::refs.size()+1)&&m.get_position()!=INT_MAX) { - fprintf(stderr, "strange size \n"); + fprintf(stderr, "strange size %d >= %d\n", m.get_position(), (int)(Mutation::refs.size()+1)); raise(SIGTRAP); } if (!mutations.empty()) { @@ -658,12 +658,14 @@ void get_random_single_subtree(const Mutation_Annotated_Tree::Tree &T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, - bool retain_original_branch_len = false); + bool retain_original_branch_len = false, + std::vector anchor_samples = std::vector()); void get_random_sample_subtrees(const Mutation_Annotated_Tree::Tree &T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, - bool retain_original_branch_len = false); + bool retain_original_branch_len = false, + std::vector anchor_samples = std::vector()); bool load_mutation_annotated_tree (std::string filename,Tree& tree); void save_mutation_annotated_tree (const Tree& tree, std::string filename); void get_sample_mutation_paths (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string mutation_paths_filename); diff --git a/src/matUtils/extract.cpp b/src/matUtils/extract.cpp index 156e01c1..0d9f3986 100644 --- a/src/matUtils/extract.cpp +++ b/src/matUtils/extract.cpp @@ -97,6 +97,8 @@ po::variables_map parse_extract_command(po::parsed_options parsed) { "Use to produce an usher-style minimum set of subtrees of the indicated size which include all of the selected samples. Produces .nh and .txt files.") ("usher-clades-txt", po::bool_switch(), "When producing usher-style subtree(s), also write an usher-style clades.txt file with clade annotations for selected samples, if the tree has clade annotations.") + ("usher-anchor-samples", po::value()->default_value(""), + "Add samples from file to usher-style subtree(s) (e.g. to provide larger-scale context by including well-known vaccine strains)") ("add-random,W", po::value()->default_value(0), "Add exactly W samples at random to your selection. Affected by -Z and overridden by -z.") ("select-nearest,Y", po::value()->default_value(0), @@ -168,6 +170,7 @@ void extract_main (po::parsed_options parsed) { size_t usher_single_subtree_size = vm["usher-single-subtree-size"].as(); size_t usher_minimum_subtrees_size = vm["usher-minimum-subtrees-size"].as(); bool usher_clades_txt = vm["usher-clades-txt"].as(); + std::string usher_anchor_samples = vm["usher-anchor-samples"].as(); size_t setsize = vm["set-size"].as(); size_t minimum_subtrees_size = vm["minimum-subtrees-size"].as(); bool limit_lca = vm["limit-to-lca"].as(); @@ -494,11 +497,24 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) { //if usher-style subtree output is requested, //produce that. + std::vector anchor_samples; + if (usher_anchor_samples != "") { + if (usher_minimum_subtrees_size > 0 || usher_single_subtree_size) { + anchor_samples = read_sample_names(usher_anchor_samples); + if (anchor_samples.size() == 0) { + fprintf(stderr, "ERROR: --usher-anchor-samples file is empty or unparseable!"); + exit(1); + } + } else { + fprintf(stderr, "ERROR: --usher-anchor-samples may be used only together with --usher-minimum-subtrees-size/-x and/or --usher-single-subtree-size/-X\n"); + exit(1); + } + } if (usher_minimum_subtrees_size > 0) { timer.Start(); fprintf(stderr, "Random minimum sample subtrees of size %ld requested.\n", usher_minimum_subtrees_size); if (samples.size() > 0) { - MAT::get_random_sample_subtrees(&T, samples, dir_prefix, usher_minimum_subtrees_size, 0, false, retain_branch); + MAT::get_random_sample_subtrees(&T, samples, dir_prefix, usher_minimum_subtrees_size, 0, false, retain_branch, anchor_samples); } else { fprintf(stderr, "ERROR: Minimum sample subtree output requested with no valid samples! Check selection parameters\n"); exit(1); @@ -509,7 +525,7 @@ usher_single_subtree_size == 0 && usher_minimum_subtrees_size == 0) { timer.Start(); fprintf(stderr, "Random single encompassing subtree of size %ld requested.\n", usher_single_subtree_size); if (samples.size() > 0) { - MAT::get_random_single_subtree(&T, samples, dir_prefix, usher_single_subtree_size, 0, false, retain_branch); + MAT::get_random_single_subtree(&T, samples, dir_prefix, usher_single_subtree_size, 0, false, retain_branch, anchor_samples); } else { fprintf(stderr, "ERROR: Encompassing subtree output requested with no valid samples! Check selection parameters\n"); exit(1); diff --git a/src/mutation_annotated_tree.cpp b/src/mutation_annotated_tree.cpp index 09a3e005..a27d3e76 100644 --- a/src/mutation_annotated_tree.cpp +++ b/src/mutation_annotated_tree.cpp @@ -1690,7 +1690,7 @@ void Mutation_Annotated_Tree::clear_tree(Mutation_Annotated_Tree::Tree& T) { } } -void Mutation_Annotated_Tree::get_random_single_subtree (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len) { +void Mutation_Annotated_Tree::get_random_single_subtree (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len, std::vector anchor_samples) { //timer.Start(); std::string preid = "/"; if (use_tree_idx) { @@ -1716,6 +1716,9 @@ void Mutation_Annotated_Tree::get_random_single_subtree (Mutation_Annotated_Tree leaves_to_keep.emplace_back(l->identifier); } + // Add "anchor samples" (if any) + leaves_to_keep.insert(leaves_to_keep.end(), anchor_samples.begin(), anchor_samples.end()); + auto new_T = Mutation_Annotated_Tree::get_subtree(*T, leaves_to_keep); // Rotate tree for display @@ -1723,8 +1726,11 @@ void Mutation_Annotated_Tree::get_random_single_subtree (Mutation_Annotated_Tree // Write subtree to file auto subtree_filename = outdir + preid + "single-subtree.nh"; - fprintf(stderr, "Writing single subtree with %zu randomly added leaves to file %s.\n", subtree_size, subtree_filename.c_str()); - + if (anchor_samples.size() > 0) { + fprintf(stderr, "Writing single subtree with %zu randomly added leaves and %zu anchor samples to file %s.\n", subtree_size, anchor_samples.size(), subtree_filename.c_str()); + } else { + fprintf(stderr, "Writing single subtree with %zu randomly added leaves to file %s.\n", subtree_size, subtree_filename.c_str()); + } std::ofstream subtree_file(subtree_filename.c_str(), std::ofstream::out); std::stringstream newick_ss; write_newick_string(newick_ss, new_T, new_T.root, true, true, retain_original_branch_len); @@ -1776,7 +1782,7 @@ void Mutation_Annotated_Tree::get_random_single_subtree (Mutation_Annotated_Tree } } -void Mutation_Annotated_Tree::get_random_sample_subtrees (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len) { +void Mutation_Annotated_Tree::get_random_sample_subtrees (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx, bool use_tree_idx, bool retain_original_branch_len, std::vector anchor_samples) { fprintf(stderr, "Computing subtrees for %ld samples. \n\n", samples.size()); std::string preid = "/"; if (use_tree_idx) { @@ -1903,6 +1909,9 @@ void Mutation_Annotated_Tree::get_random_sample_subtrees (Mutation_Annotated_Tre } } + // Add "anchor samples" (if any) + leaves_to_keep.insert(leaves_to_keep.end(), anchor_samples.begin(), anchor_samples.end()); + auto new_T = Mutation_Annotated_Tree::get_subtree(*T, leaves_to_keep); // Rotate tree for display diff --git a/src/mutation_annotated_tree.hpp b/src/mutation_annotated_tree.hpp index bfc2d4ef..fe5f1a20 100644 --- a/src/mutation_annotated_tree.hpp +++ b/src/mutation_annotated_tree.hpp @@ -175,8 +175,8 @@ Tree get_tree_copy(const Tree& tree, const std::string& identifier=""); Node* LCA (const Tree& tree, const std::string& node_id1, const std::string& node_id2); Tree get_subtree (const Tree& tree, const std::vector& samples, bool keep_clade_annotations=false); -void get_random_single_subtree (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, bool retain_original_branch_len = false); -void get_random_sample_subtrees (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, bool retain_original_branch_len = false); +void get_random_single_subtree (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, bool retain_original_branch_len = false, std::vector anchor_samples = std::vector()); +void get_random_sample_subtrees (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string outdir, size_t subtree_size, size_t tree_idx = 0, bool use_tree_idx = false, bool retain_original_branch_len = false, std::vector anchor_samples = std::vector()); void get_sample_mutation_paths (Mutation_Annotated_Tree::Tree* T, std::vector samples, std::string mutation_paths_filename); void clear_tree(Tree& tree); diff --git a/src/usher-sampled/driver/main.cpp b/src/usher-sampled/driver/main.cpp index b631b522..20ee6e4d 100644 --- a/src/usher-sampled/driver/main.cpp +++ b/src/usher-sampled/driver/main.cpp @@ -430,6 +430,8 @@ int main(int argc, char **argv) { "Write minimum set of subtrees covering the newly added samples of size equal to this value") ("write-single-subtree,K", po::value(&options.out_options.print_subtrees_single)->default_value(0), \ "Similar to write-subtrees-size but produces a single subtree with all newly added samples along with random samples up to the value specified by this argument") + ("anchor-samples", po::value(&options.out_options.anchor_samples_file), + "Add samples from file to subtree(s) generated by --write-subtrees-size and/or --write-single-subtree (e.g. to provide larger-scale context by including well-known vaccine strains)") ("multiple-placements,M", po::value(&options.keep_n_tree)->default_value(1), \ "Create a new tree up to this limit for each possibility of parsimony-optimal placement") ("retain-input-branch-lengths,l", po::bool_switch(&options.out_options.retain_original_branch_len)->default_value(false), \ diff --git a/src/usher-sampled/driver/socket.cpp b/src/usher-sampled/driver/socket.cpp index bd3b7552..01b00170 100644 --- a/src/usher-sampled/driver/socket.cpp +++ b/src/usher-sampled/driver/socket.cpp @@ -281,6 +281,8 @@ std::string get_options(FILE *f, Leader_Thread_Options &options,std::string& ext "Input VCF file (in uncompressed or gzip-compressed .gz format)") ("existing_samples", po::value(&extract_from_existing), "extract from existing samples") + ("anchor_samples", po::value(&options.out_options.anchor_samples_file), + "add samples from file to generated subtree(s)") ( "outdir,d", po::value(&options.out_options.outdir)->default_value("."), @@ -400,19 +402,10 @@ static void child_proc(int fd, TreeCollectionPtr &trees_ptr) { tbb::task_scheduler_init init(num_threads); if (existing_samples != "") { MAT::Tree &tree = iter->second->expanded_tree; - std::fstream sample_file(existing_samples,std::ios_base::in); - std::string sample_name; - std::vector nodes_to_extract; - while (sample_file) { - std::getline(sample_file, sample_name); - if (sample_name != "") { - auto node = tree.get_node(sample_name); - if (!node) { - fprintf(f, "node %s do not exist\n", sample_name.c_str()); - } else { - nodes_to_extract.push_back(node); - } - } + std::vector nodes_to_extract = read_sample_nodes(existing_samples, tree, f); + std::vector anchor_sample_nodes; + if (options.out_options.anchor_samples_file != "") { + anchor_sample_nodes = read_sample_nodes(options.out_options.anchor_samples_file, tree, f); } if (options.out_options.detailed_clades) { int num_annotation=tree.get_num_annotations(); @@ -441,14 +434,14 @@ static void child_proc(int fd, TreeCollectionPtr &trees_ptr) { options.out_options.print_subtrees_single); MAT::get_random_single_subtree( tree, nodes_to_extract, options.out_options.outdir, options.out_options.print_subtrees_single, - 0, false, options.out_options.retain_original_branch_len); + 0, false, options.out_options.retain_original_branch_len, anchor_sample_nodes); } // check_leaves(T); if ((options.out_options.print_subtrees_size > 1)) { fprintf(stderr, "Computing subtrees for added samples. \n\n"); MAT::get_random_sample_subtrees( tree, nodes_to_extract, options.out_options.outdir, options.out_options.print_subtrees_size, 0, - false, options.out_options.retain_original_branch_len); + false, options.out_options.retain_original_branch_len, anchor_sample_nodes); } } else { diff --git a/src/usher-sampled/usher.hpp b/src/usher-sampled/usher.hpp index 98f858f0..c0c151a1 100644 --- a/src/usher-sampled/usher.hpp +++ b/src/usher-sampled/usher.hpp @@ -81,6 +81,9 @@ void Sample_Input(const char *name, std::vector &sample_mutations, Mutation_Set get_mutations(const MAT::Node *main_tree_node); void check_descendant_nuc(const MAT::Node* node); #endif + +std::vectorread_sample_nodes(std::string samples_file, MAT::Tree &T, FILE *f); + struct output_options { bool print_uncondensed_tree; std::string outdir; @@ -92,6 +95,7 @@ struct output_options { bool detailed_clades; bool redo_FS_Min_Back_Mutations; bool suppress_whole_newick; + std::string anchor_samples_file; }; bool final_output(MAT::Tree& T,const output_options& options,int t_idx,std::vector& assigned_clades, size_t sample_start_idx,size_t sample_end_idx,std::vector& low_confidence_samples, @@ -174,4 +178,4 @@ int prep_tree(MAT::Tree &tree); void load_diff_for_usher( const char *input_path,std::vector& all_samples, std::vector& position_wise_out, MAT::Tree &tree, - const std::string& fasta_fname,std::vector & samples); \ No newline at end of file + const std::string& fasta_fname,std::vector & samples); diff --git a/src/usher-sampled/utils.cpp b/src/usher-sampled/utils.cpp index c33ff8a6..295a07e5 100644 --- a/src/usher-sampled/utils.cpp +++ b/src/usher-sampled/utils.cpp @@ -618,6 +618,25 @@ void print_annotation(const MAT::Tree &T, const output_options &options, fclose(annotations_file); } + +std::vectorread_sample_nodes(std::string samples_file, MAT::Tree &T, FILE *f) { + std::vector sample_nodes; + std::fstream sample_file(samples_file,std::ios_base::in); + std::string sample_name; + while (sample_file) { + std::getline(sample_file, sample_name); + if (sample_name != "") { + auto node = T.get_node(sample_name); + if (!node) { + fprintf(f, "node %s in file %s does not exist\n", sample_name.c_str(), samples_file.c_str()); + } else { + sample_nodes.push_back(node); + } + } + } + return sample_nodes; +} + bool final_output(MAT::Tree &T, const output_options &options, int t_idx, std::vector &assigned_clades, size_t sample_start_idx, size_t sample_end_idx, @@ -690,13 +709,17 @@ bool final_output(MAT::Tree &T, const output_options &options, int t_idx, // fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); } + std::vector anchor_sample_nodes; + if (options.anchor_samples_file != "") { + anchor_sample_nodes = read_sample_nodes(options.anchor_samples_file, T, stderr); + } //check_leaves(T); if ((options.print_subtrees_single > 1) ) { fprintf(stderr, "Computing the single subtree for added samples with %zu random leaves. \n\n", options.print_subtrees_single); //timer.Start(); // For each final tree, write a subtree of user-specified size around // each newly placed sample in newick format - MAT::get_random_single_subtree(T, targets, options.outdir, options.print_subtrees_single, t_idx, use_tree_idx, options.retain_original_branch_len); + MAT::get_random_single_subtree(T, targets, options.outdir, options.print_subtrees_single, t_idx, use_tree_idx, options.retain_original_branch_len, anchor_sample_nodes); //fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); } //check_leaves(T); @@ -706,7 +729,7 @@ bool final_output(MAT::Tree &T, const output_options &options, int t_idx, // For each final tree, write a subtree of user-specified size around // each newly placed sample in newick format //timer.Start(); - MAT::get_random_sample_subtrees(T, targets, options.outdir, options.print_subtrees_size, t_idx, use_tree_idx, options.retain_original_branch_len); + MAT::get_random_sample_subtrees(T, targets, options.outdir, options.print_subtrees_size, t_idx, use_tree_idx, options.retain_original_branch_len, anchor_sample_nodes); //fprintf(stderr, "Completed in %ld msec \n\n", timer.Stop()); }