diff --git a/include/bbp/sonata/report_reader.h b/include/bbp/sonata/report_reader.h index 0fa7c255..3dbe27eb 100644 --- a/include/bbp/sonata/report_reader.h +++ b/include/bbp/sonata/report_reader.h @@ -102,9 +102,6 @@ template class SONATA_API ReportReader { public: - using Range = std::pair; - using Ranges = std::vector; - class Population { public: @@ -144,11 +141,9 @@ class SONATA_API ReportReader * * \param node_ids limit the report to the given selection. If nullptr, all nodes in the * report are used - * \param fn lambda applied to all ranges for all node ids */ typename DataFrame::DataType getNodeIdElementIdMapping( - const nonstd::optional& node_ids = nonstd::nullopt, - std::function fn = nullptr) const; + const nonstd::optional& node_ids = nonstd::nullopt) const; /** * \param node_ids limit the report to the given selection. @@ -163,11 +158,27 @@ class SONATA_API ReportReader const nonstd::optional& tstride = nonstd::nullopt) const; private: + struct NodeIdElementLayout { + typename DataFrame::DataType ids; + Selection::Ranges node_ranges; + Selection::Range min_max_range; + }; + Population(const H5::File& file, const std::string& populationName); std::pair getIndex(const nonstd::optional& tstart, const nonstd::optional& tstop) const; + /** + * Return the element IDs for the given selection, alongside the filtered node pointers + * and the range of positions where they fit in the file. This latter two are necessary + * for performance to understand how and where to retrieve the data from storage. + * + * \param node_ids limit the report to the given selection. If nullptr, all nodes in the + * report are used + */ + NodeIdElementLayout getNodeIdElementLayout( + const nonstd::optional& node_ids = nonstd::nullopt) const; - std::map nodes_pointers_; + std::map node_ranges_; H5::Group pop_group_; std::vector nodes_ids_; double tstart_, tstop_, tstep_; @@ -175,8 +186,6 @@ class SONATA_API ReportReader std::string time_units_; std::string data_units_; bool nodes_ids_sorted_ = false; - Selection::Values node_ids_from_selection( - const nonstd::optional& node_ids = nonstd::nullopt) const; friend ReportReader; }; diff --git a/python/bindings.cpp b/python/bindings.cpp index 40514f9b..1bb30c3e 100644 --- a/python/bindings.cpp +++ b/python/bindings.cpp @@ -368,7 +368,7 @@ void bindReportReader(py::module& m, const std::string& prefix) { "get_node_id_element_id_mapping", [](const typename ReportType::Population& population, const nonstd::optional& selection) { - return population.getNodeIdElementIdMapping(selection, nullptr); + return population.getNodeIdElementIdMapping(selection); }, DOC_REPORTREADER_POP(getNodeIdElementIdMapping), "selection"_a = nonstd::nullopt) diff --git a/src/report_reader.cpp b/src/report_reader.cpp index 3723c7b2..a5834bbe 100644 --- a/src/report_reader.cpp +++ b/src/report_reader.cpp @@ -231,8 +231,8 @@ ReportReader::Population::Population(const H5::File& file, const std::string& mapping_group.getDataSet("index_pointers").read(index_pointers); for (size_t i = 0; i < nodes_ids_.size(); ++i) { - nodes_pointers_.emplace(nodes_ids_[i], - std::make_pair(index_pointers[i], index_pointers[i + 1])); + node_ranges_.emplace(nodes_ids_[i], + std::make_pair(index_pointers[i], index_pointers[i + 1])); } { // Get times @@ -284,24 +284,82 @@ std::vector ReportReader::Population::getNodeIds() const { } template -Selection::Values ReportReader::Population::node_ids_from_selection( - const nonstd::optional& selection) const { - Selection::Values node_ids; - - if (!selection) { // Take all nodes in this case - node_ids.reserve(nodes_pointers_.size()); - std::transform(nodes_pointers_.begin(), - nodes_pointers_.end(), - std::back_inserter(node_ids), - [](const std::pair& node_pointer) { - return node_pointer.first; - }); - } else if (selection->empty()) { - return {}; +typename ReportReader::Population::NodeIdElementLayout +ReportReader::Population::getNodeIdElementLayout( + const nonstd::optional& node_ids) const { + std::vector concrete_node_ids; + NodeIdElementLayout result; + result.min_max_range = {std::numeric_limits::max(), + std::numeric_limits::min()}; + size_t element_ids_count = 0; + + // Helper function to update layout, alongside the min / max values and count + const auto update_layout = [&](const NodeID& node_id, const Selection::Range& range) { + concrete_node_ids.emplace_back(node_id); + result.node_ranges.emplace_back(range); + + result.min_max_range.first = std::min(result.min_max_range.first, range.first); + result.min_max_range.second = std::max(result.min_max_range.second, range.second); + element_ids_count += (range.second - range.first); + }; + + // Take all nodes if no selection is provided + if (!node_ids) { + concrete_node_ids.reserve(node_ranges_.size()); + result.node_ranges.reserve(node_ranges_.size()); + + for (const auto& node_range : node_ranges_) { + update_layout(node_range.first, node_range.second); + } + } else if (!node_ids->empty()) { + const auto selected_node_ids = node_ids->flatten(); + + // Reserve space for all the requested IDs and shrink afterwards + concrete_node_ids.reserve(selected_node_ids.size()); + result.node_ranges.reserve(selected_node_ids.size()); + + for (const auto node_id : selected_node_ids) { + const auto it = node_ranges_.find(node_id); + if (it != node_ranges_.end()) { + update_layout(it->first, it->second); + } + } + + concrete_node_ids.shrink_to_fit(); + result.node_ranges.shrink_to_fit(); } else { - node_ids = selection->flatten(); + // node_ids Selection exists, but is empty } - return node_ids; + + // Extract the ElementIDs from the GIDs + if (!result.node_ranges.empty()) { + const auto min = result.min_max_range.first; + const auto max = result.min_max_range.second; + + std::vector element_ids; + auto dataset_elem_ids = pop_group_.getGroup("mapping").getDataSet("element_ids"); + dataset_elem_ids.select({min}, {max - min}).read(element_ids); + + result.ids.reserve(element_ids_count); + + for (size_t i = 0; i < concrete_node_ids.size(); ++i) { + const auto node_id = concrete_node_ids[i]; + const auto& range = result.node_ranges[i]; + + for (auto i = (range.first - min); i < (range.second - min); i++) { + result.ids.emplace_back(make_key(node_id, element_ids[i])); + } + } + + // Temp. fix: When you ask for a large hyperslab in a dataset and then move + // to another dataset in the same file where you also ask for + // another large range, the next IOps take an extra few seconds. + // We observed that fooling HDF5 hides the issue, but we should + // verify this behaviour once new releases of HDF5 are available. + dataset_elem_ids.select({min}, {1}).read(element_ids); + } + + return result; } template @@ -341,38 +399,15 @@ std::pair ReportReader::Population::getIndex( template typename DataFrame::DataType ReportReader::Population::getNodeIdElementIdMapping( - const nonstd::optional& selection, std::function fn) const { - typename DataFrame::DataType ids{}; - - Selection::Values node_ids = node_ids_from_selection(selection); - - auto dataset_elem_ids = pop_group_.getGroup("mapping").getDataSet("element_ids"); - for (const auto& node_id : node_ids) { - const auto it = nodes_pointers_.find(node_id); - if (it == nodes_pointers_.end()) { - continue; - } - - std::vector element_ids(it->second.second - it->second.first); - dataset_elem_ids.select({it->second.first}, {it->second.second - it->second.first}) - .read(element_ids.data()); - for (const auto& elem : element_ids) { - ids.push_back(make_key(node_id, elem)); - } - - if (fn) { - fn(it->second); - } - } - return ids; + const nonstd::optional& node_ids) const { + return getNodeIdElementLayout(node_ids).ids; } template -DataFrame ReportReader::Population::get(const nonstd::optional& selection, +DataFrame ReportReader::Population::get(const nonstd::optional& node_ids, const nonstd::optional& tstart, const nonstd::optional& tstop, const nonstd::optional& tstride) const { - DataFrame data_frame; size_t index_start = 0; size_t index_stop = 0; std::tie(index_start, index_stop) = getIndex(tstart, tstop); @@ -384,24 +419,26 @@ DataFrame ReportReader::Population::get(const nonstd::optional& throw SonataError("tstart should be <= to tstop"); } - for (size_t i = index_start; i <= index_stop; i += stride) { - data_frame.times.push_back(times_index_[i].second); - } - // min and max offsets of the node_ids requested are calculated // to reduce the amount of IO that is brought to memory - Ranges positions; - uint64_t min = std::numeric_limits::max(); - uint64_t max = std::numeric_limits::min(); - data_frame.ids = getNodeIdElementIdMapping(selection, [&](const Range& range) { - min = std::min(range.first, min); - max = std::max(range.second, max); - positions.emplace_back(range.first, range.second); - }); - if (data_frame.ids.empty()) { // At the end no data available (wrong node_ids?) + const auto node_id_element_layout = getNodeIdElementLayout(node_ids); + const auto& node_ranges = node_id_element_layout.node_ranges; + const auto min = node_id_element_layout.min_max_range.first; + const auto max = node_id_element_layout.min_max_range.second; + + if (node_id_element_layout.ids.empty()) { // At the end no data available (wrong node_ids?) return DataFrame{{}, {}, {}}; } + // Fill times + DataFrame data_frame; + for (size_t i = index_start; i <= index_stop; i += stride) { + data_frame.times.push_back(times_index_[i].second); + } + + // Fill ids + data_frame.ids = node_id_element_layout.ids; + // Fill .data member size_t n_time_entries = ((index_stop - index_start) / stride) + 1; size_t n_ids = data_frame.ids.size(); @@ -414,6 +451,7 @@ DataFrame ReportReader::Population::get(const nonstd::optional& fmt::format("DataType of dataset 'data' should be Float32 ('{}' was found)", dataset_type.string())); } + std::vector buffer(max - min); for (size_t timer_index = index_start; timer_index <= index_stop; timer_index += stride) { // Note: The code assumes that the file is chunked by rows and not by columns @@ -423,20 +461,21 @@ DataFrame ReportReader::Population::get(const nonstd::optional& off_t offset = 0; off_t data_offset = (timer_index - index_start) / stride; auto data_ptr = &data_frame.data[data_offset * n_ids]; - for (const auto& position : positions) { - uint64_t elements_per_gid = position.second - position.first; - uint64_t gid_start = position.first - min; + for (const auto& range : node_ranges) { + uint64_t elements_per_gid = range.second - range.first; + uint64_t gid_start = range.first - min; // Soma report if (elements_per_gid == 1) { data_ptr[offset] = buffer[gid_start]; } else { // Elements report - uint64_t gid_end = position.second - min; + uint64_t gid_end = range.second - min; std::copy(&buffer[gid_start], &buffer[gid_end], &data_ptr[offset]); } offset += elements_per_gid; } } + return data_frame; }