Skip to content

Commit

Permalink
Merge pull request #206 from vgteam/distance-indexing
Browse files Browse the repository at this point in the history
Distance indexing
  • Loading branch information
xchang1 authored Dec 6, 2024
2 parents 14f9d7b + 9014aa6 commit 2116643
Show file tree
Hide file tree
Showing 2 changed files with 324 additions and 133 deletions.
97 changes: 60 additions & 37 deletions bdsg/include/bdsg/snarl_distance_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,11 @@ class SnarlDistanceIndex : public SnarlDecomposition, public TriviallySerializab
ROOT_SNARL, DISTANCED_ROOT_SNARL,
CHAIN, DISTANCED_CHAIN, MULTICOMPONENT_CHAIN,
CHILDREN};
//Report whether a record type is one of the types that carry distances:
//the DISTANCED_* types, OVERSIZED_SNARL, and MULTICOMPONENT_CHAIN
//Every other record type gives false
const static bool has_distances(record_t type) {
    switch (type) {
        case DISTANCED_NODE:
        case DISTANCED_TRIVIAL_SNARL:
        case DISTANCED_SIMPLE_SNARL:
        case DISTANCED_SNARL:
        case OVERSIZED_SNARL:
        case DISTANCED_ROOT_SNARL:
        case DISTANCED_CHAIN:
        case MULTICOMPONENT_CHAIN:
            return true;
        default:
            return false;
    }
}



Expand Down Expand Up @@ -816,14 +821,16 @@ class SnarlDistanceIndex : public SnarlDecomposition, public TriviallySerializab
* [trivial snarl tag, pointer to parent, node count, prefix sum, fd loop, rev loop, component]
* The record is followed by [node id+orientation, right prefix sum] for each node in the trivial snarl
* So the total length of the trivial snarl is 8+2*#nodes
* So the total length of the distanced trivial snarl is 8+2*#nodes, and the length of a distanceless
* trivial snarl is 3+#nodes (DISTANCELESS_TRIVIAL_SNARL_RECORD_SIZE values followed by one value per node)
* The right prefix sum is the sum from the start of the trivial chain to the right side of the node (relative to the chain)
* The node_record_offset in a net_handle_t to a trivial snarl points to a node in the trivial snarl
*/
//Number of bits set aside for the offset of a node within a trivial snarl
//NOTE(review): presumably the width of node_record_offset in a net_handle_t - confirm
const static size_t BITS_FOR_TRIVIAL_NODE_OFFSET = 8;
//Largest value that fits in BITS_FOR_TRIVIAL_NODE_OFFSET bits (255)
const static size_t MAX_TRIVIAL_SNARL_NODE_COUNT = (1 << BITS_FOR_TRIVIAL_NODE_OFFSET) -1;
//Number of values in a trivial snarl record before the per-node entries
//Same value as DISTANCED_TRIVIAL_SNARL_RECORD_SIZE
const static size_t TRIVIAL_SNARL_RECORD_SIZE = 8;
//Number of values before the per-node entries in a DISTANCED_TRIVIAL_SNARL record
const static size_t DISTANCED_TRIVIAL_SNARL_RECORD_SIZE = 8;
//Number of values before the per-node entries in a trivial snarl record without distances
const static size_t DISTANCELESS_TRIVIAL_SNARL_RECORD_SIZE = 3;
//Positions of the header fields, counted from the start of the trivial snarl record
const static size_t TRIVIAL_SNARL_PARENT_OFFSET = 1;
const static size_t TRIVIAL_SNARL_NODE_COUNT_OFFSET = 2;
const static size_t TRIVIAL_SNARL_PREFIX_SUM_OFFSET = 3;
Expand Down Expand Up @@ -888,12 +895,18 @@ class SnarlDistanceIndex : public SnarlDecomposition, public TriviallySerializab
* that include boundary nodes (OVERSIZED_SNARL)
*/
size_t snarl_size_limit = 5000;

//If this is true, then only store distance along top-level chains. Everything still needs its minimum lengths to get
//the distances along top-level chains but don't store internal distances in snarls or in nested chains
//This overrides snarl_size_limit
bool only_top_level_chain_distances=false;
//NOTE(review): presumably the cap on how many snarl size limit warnings get reported - confirm where it is used
static const int max_num_size_limit_warnings = 100;
//Atomic counter that starts at 0
//NOTE(review): presumably counts the size limit warnings reported so far - confirm where it is used
std::atomic<int> size_limit_warnings{0};
//NOTE(review): presumably identifies the serialized index format - confirm where it is used
static const uint32_t magic_number = 1738636486;

public:
void set_snarl_size_limit (size_t size) {snarl_size_limit=size;}
void set_only_top_level_chain_distances (bool only_chain) {only_top_level_chain_distances=only_chain;}



Expand Down Expand Up @@ -1139,7 +1152,9 @@ class SnarlDistanceIndex : public SnarlDecomposition, public TriviallySerializab
bool get_is_reversed_in_parent(size_t node_rank) const; //is the node_rank-th node reversed

//How many values does this trivial snarl record take up, including the per-node entries?
//A DISTANCED_TRIVIAL_SNARL has DISTANCED_TRIVIAL_SNARL_RECORD_SIZE values followed by two values per node
//Any other trivial snarl has DISTANCELESS_TRIVIAL_SNARL_RECORD_SIZE values followed by one value per node
size_t get_record_size() {
    //The size must depend on the record type: an unconditional distanced-size return
    //would over-report the length of records that don't store distances
    return get_record_type() == DISTANCED_TRIVIAL_SNARL
        ? DISTANCED_TRIVIAL_SNARL_RECORD_SIZE + (get_node_count() * 2)
        : DISTANCELESS_TRIVIAL_SNARL_RECORD_SIZE + get_node_count();
}
TrivialSnarlRecord (size_t offset, const bdsg::yomo::UniqueMappedPointer<bdsg::MappedIntVector>* tree_records);
};
Expand Down Expand Up @@ -1243,7 +1258,7 @@ class SnarlDistanceIndex : public SnarlDecomposition, public TriviallySerializab
SimpleSnarlRecord (net_handle_t net, const bdsg::yomo::UniqueMappedPointer<bdsg::MappedIntVector>* tree_records);

//Size of a whole simple snarl record containing node_count nodes:
//SIMPLE_SNARL_RECORD_SIZE values plus two values for each node
const static size_t record_size(size_t node_count) {
    return SIMPLE_SNARL_RECORD_SIZE + (2 * node_count);
}
//Size of a whole simple snarl record containing node_count nodes:
//SIMPLE_SNARL_RECORD_SIZE values plus two values for each node
//include_distances is accepted but does not change the result
const static size_t record_size(size_t node_count, bool include_distances) {
    return SIMPLE_SNARL_RECORD_SIZE + (2 * node_count);
}
size_t record_size() ;

//Get and set the distances between two node sides in the graph
Expand Down Expand Up @@ -1390,7 +1405,7 @@ class SnarlDistanceIndex : public SnarlDecomposition, public TriviallySerializab
//If new_record is true, make a new trivial snarl record for the node
size_t add_node(nid_t node_id, size_t node_length, bool is_reversed_in_parent,
size_t prefix_sum, size_t forward_loop, size_t reverse_loop, size_t component,
size_t max_prefix_sum, size_t previous_child_offset, bool new_record);
size_t max_prefix_sum, size_t previous_child_offset, bool new_record, bool include_distances);

};

Expand Down Expand Up @@ -1484,66 +1499,73 @@ class SnarlDistanceIndex : public SnarlDecomposition, public TriviallySerializab
};
//Temporary record for a chain
//Every member is declared exactly once; the size_t members all have default values
struct TemporaryChainRecord : TemporaryRecord {
    //Ids of the start and end nodes of the chain
    handlegraph::nid_t start_node_id;
    handlegraph::nid_t end_node_id;
    size_t end_node_length=0;
    size_t tree_depth=0; //TODO: This isn't used but I left it because I couldn't get the python bindings to build when I changed it
    //Type of the parent and offset into the appropriate vector
    //(TEMP_ROOT, 0) if this is a root level chain
    pair<temp_record_t, size_t> parent;
    size_t min_length=0;//Including boundary nodes
    size_t max_length = 0;

    //Distances from the left/right of the node to the start/end of the parent snarl
    size_t distance_left_start = std::numeric_limits<size_t>::max();
    size_t distance_right_start = std::numeric_limits<size_t>::max();
    size_t distance_left_end = std::numeric_limits<size_t>::max();
    size_t distance_right_end = std::numeric_limits<size_t>::max();

    size_t rank_in_parent=0;

    //What is the index of this record in root_snarl_components
    size_t root_snarl_index = std::numeric_limits<size_t>::max();

    //The flags are kept together after the size_t members
    //The ones without a default value here must be set before they are read
    bool start_node_rev;
    bool end_node_rev;
    bool reversed_in_parent;
    bool is_trivial;
    bool is_tip = false;
    bool loopable = true; //If this is a looping snarl, this is false if the last snarl is not start-end connected

    vector<pair<temp_record_t, size_t>> children; //All children, both nodes and snarls, in order
    //Distances for the chain, one entry per node
    //TODO This would probably be more efficient as a vector of a struct of five ints
    vector<size_t> prefix_sum;
    vector<size_t> max_prefix_sum;
    vector<size_t> forward_loops;
    vector<size_t> backward_loops;
    vector<size_t> chain_components;//Which component does each node belong to, usually all 0s

    //NOTE(review): presumably an upper bound on the size of the final record for this chain - confirm against the definition
    size_t get_max_record_length() const;
    //Same, with the caller saying whether distances will be included in the record
    size_t get_max_record_length(bool include_distances) const;
};
//Temporary record for a snarl
//Every member is declared exactly once; the size_t members all have default values
struct TemporarySnarlRecord : TemporaryRecord{
    //Id and length of the start node, then of the end node
    handlegraph::nid_t start_node_id;
    size_t start_node_length=0;
    handlegraph::nid_t end_node_id;
    size_t end_node_length=0;
    size_t node_count=0;
    size_t min_length = std::numeric_limits<size_t>::max(); //Not including boundary nodes
    size_t max_length = 0;
    size_t max_distance = 0;
    pair<temp_record_t, size_t> parent;
    size_t tree_depth = 0; //TODO: This isn't used but I left it because I couldn't get the python bindings to build when I changed it

    size_t distance_start_start = std::numeric_limits<size_t>::max();
    size_t distance_end_end = std::numeric_limits<size_t>::max();

    size_t rank_in_parent=0;

    //The flags are kept together after the size_t members
    //The ones without a default value here must be set before they are read
    bool reversed_in_parent;
    bool start_node_rev;
    bool end_node_rev;
    bool is_trivial;
    bool is_simple;
    bool is_tip = false;
    bool is_root_snarl = false;
    bool include_distances = true;
    vector<pair<temp_record_t, size_t>> children; //All children, nodes and chains, in arbitrary order
    unordered_set<size_t> tippy_child_ranks; //The ranks of children that are tips
    //vector<tuple<pair<size_t, bool>, pair<size_t, bool>, size_t>> distances;
    unordered_map<pair<pair<size_t, bool>, pair<size_t, bool>>, size_t> distances;

    //NOTE(review): presumably an upper bound on the size of the final record for this snarl - confirm against the definition
    size_t get_max_record_length() const ;
};
Expand All @@ -1554,17 +1576,18 @@ class SnarlDistanceIndex : public SnarlDecomposition, public TriviallySerializab
}
//Each member is declared exactly once; the size_t members all have default values
handlegraph::nid_t node_id;
pair<temp_record_t, size_t> parent;
size_t node_length=0;
size_t rank_in_parent=0;
size_t root_snarl_index = std::numeric_limits<size_t>::max();
//Distances from the left/right of the node to the start/end of the parent snarl
size_t distance_left_start = std::numeric_limits<size_t>::max();
size_t distance_right_start = std::numeric_limits<size_t>::max();
size_t distance_left_end = std::numeric_limits<size_t>::max();
size_t distance_right_end = std::numeric_limits<size_t>::max();

//The flags are kept together after the size_t members
//reversed_in_parent has no default value and must be set before it is read
bool reversed_in_parent;
bool is_tip = false;


//The maximum record length for a node is always NODE_RECORD_SIZE
const static size_t get_max_record_length() {
    return NODE_RECORD_SIZE;
}
Expand Down
Loading

0 comments on commit 2116643

Please sign in to comment.