Only do a single read when getting a report frame (#174)

* previously, each NodeID's data was being read individual, meaning that the number of HDF5 reads was large * switch to make a single read per frame (ie: timestep); this means the min/max extent is read into memory * for low number of NodeIDs, depending on their physical layout, more data may be read in; however, with more than ~1000 IDs, this new method is always faster, so it's definitely worthwhile * finally, this reduces the filesystem load significantly
BlueBrain · Feb 9, 2022 · 2863397 · 2863397
1 parent b3c2400
commit 2863397
Show file tree

Hide file tree

Showing 3 changed files with 119 additions and 71 deletions.
diff --git a/include/bbp/sonata/report_reader.h b/include/bbp/sonata/report_reader.h
@@ -102,9 +102,6 @@ template <typename KeyType>
 class SONATA_API ReportReader
 {
   public:
-    using Range = std::pair<uint64_t, uint64_t>;
-    using Ranges = std::vector<Range>;
-
     class Population
     {
       public:
@@ -144,11 +141,9 @@ class SONATA_API ReportReader
          *
          * \param node_ids limit the report to the given selection. If nullptr, all nodes in the
          * report are used
-         * \param fn lambda applied to all ranges for all node ids
          */
         typename DataFrame<KeyType>::DataType getNodeIdElementIdMapping(
-            const nonstd::optional<Selection>& node_ids = nonstd::nullopt,
-            std::function<void(const Range&)> fn = nullptr) const;
+            const nonstd::optional<Selection>& node_ids = nonstd::nullopt) const;
 
         /**
          * \param node_ids limit the report to the given selection.
@@ -163,20 +158,34 @@ class SONATA_API ReportReader
                                const nonstd::optional<size_t>& tstride = nonstd::nullopt) const;
 
       private:
+        struct NodeIdElementLayout {
+            typename DataFrame<KeyType>::DataType ids;
+            Selection::Ranges node_ranges;
+            Selection::Range min_max_range;
+        };
+
         Population(const H5::File& file, const std::string& populationName);
         std::pair<size_t, size_t> getIndex(const nonstd::optional<double>& tstart,
                                            const nonstd::optional<double>& tstop) const;
+        /**
+         * Return the element IDs for the given selection, alongside the filtered node pointers
+         * and the range of positions where they fit in the file. This latter two are necessary
+         * for performance to understand how and where to retrieve the data from storage.
+         *
+         * \param node_ids limit the report to the given selection. If nullptr, all nodes in the
+         * report are used
+         */
+        NodeIdElementLayout getNodeIdElementLayout(
+            const nonstd::optional<Selection>& node_ids = nonstd::nullopt) const;
 
-        std::map<NodeID, Range> nodes_pointers_;
+        std::map<NodeID, Selection::Range> node_ranges_;
         H5::Group pop_group_;
         std::vector<NodeID> nodes_ids_;
         double tstart_, tstop_, tstep_;
         std::vector<std::pair<size_t, double>> times_index_;
         std::string time_units_;
         std::string data_units_;
         bool nodes_ids_sorted_ = false;
-        Selection::Values node_ids_from_selection(
-            const nonstd::optional<Selection>& node_ids = nonstd::nullopt) const;
 
         friend ReportReader;
     };

diff --git a/python/bindings.cpp b/python/bindings.cpp
@@ -368,7 +368,7 @@ void bindReportReader(py::module& m, const std::string& prefix) {
             "get_node_id_element_id_mapping",
             [](const typename ReportType::Population& population,
                const nonstd::optional<Selection>& selection) {
-                return population.getNodeIdElementIdMapping(selection, nullptr);
+                return population.getNodeIdElementIdMapping(selection);
             },
             DOC_REPORTREADER_POP(getNodeIdElementIdMapping),
             "selection"_a = nonstd::nullopt)

diff --git a/src/report_reader.cpp b/src/report_reader.cpp
@@ -231,8 +231,8 @@ ReportReader<T>::Population::Population(const H5::File& file, const std::string&
         mapping_group.getDataSet("index_pointers").read(index_pointers);
 
         for (size_t i = 0; i < nodes_ids_.size(); ++i) {
-            nodes_pointers_.emplace(nodes_ids_[i],
-                                    std::make_pair(index_pointers[i], index_pointers[i + 1]));
+            node_ranges_.emplace(nodes_ids_[i],
+                                 std::make_pair(index_pointers[i], index_pointers[i + 1]));
         }
 
         {  // Get times
@@ -284,24 +284,82 @@ std::vector<NodeID> ReportReader<T>::Population::getNodeIds() const {
 }
 
 template <typename T>
-Selection::Values ReportReader<T>::Population::node_ids_from_selection(
-    const nonstd::optional<Selection>& selection) const {
-    Selection::Values node_ids;
-
-    if (!selection) {  // Take all nodes in this case
-        node_ids.reserve(nodes_pointers_.size());
-        std::transform(nodes_pointers_.begin(),
-                       nodes_pointers_.end(),
-                       std::back_inserter(node_ids),
-                       [](const std::pair<NodeID, Range>& node_pointer) {
-                           return node_pointer.first;
-                       });
-    } else if (selection->empty()) {
-        return {};
+typename ReportReader<T>::Population::NodeIdElementLayout
+ReportReader<T>::Population::getNodeIdElementLayout(
+    const nonstd::optional<Selection>& node_ids) const {
+    std::vector<NodeID> concrete_node_ids;
+    NodeIdElementLayout result;
+    result.min_max_range = {std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::min()};
+    size_t element_ids_count = 0;
+
+    // Helper function to update layout, alongside the min / max values and count
+    const auto update_layout = [&](const NodeID& node_id, const Selection::Range& range) {
+        concrete_node_ids.emplace_back(node_id);
+        result.node_ranges.emplace_back(range);
+
+        result.min_max_range.first = std::min(result.min_max_range.first, range.first);
+        result.min_max_range.second = std::max(result.min_max_range.second, range.second);
+        element_ids_count += (range.second - range.first);
+    };
+
+    // Take all nodes if no selection is provided
+    if (!node_ids) {
+        concrete_node_ids.reserve(node_ranges_.size());
+        result.node_ranges.reserve(node_ranges_.size());
+
+        for (const auto& node_range : node_ranges_) {
+            update_layout(node_range.first, node_range.second);
+        }
+    } else if (!node_ids->empty()) {
+        const auto selected_node_ids = node_ids->flatten();
+
+        // Reserve space for all the requested IDs and shrink afterwards
+        concrete_node_ids.reserve(selected_node_ids.size());
+        result.node_ranges.reserve(selected_node_ids.size());
+
+        for (const auto node_id : selected_node_ids) {
+            const auto it = node_ranges_.find(node_id);
+            if (it != node_ranges_.end()) {
+                update_layout(it->first, it->second);
+            }
+        }
+
+        concrete_node_ids.shrink_to_fit();
+        result.node_ranges.shrink_to_fit();
     } else {
-        node_ids = selection->flatten();
+        // node_ids Selection exists, but is empty
     }
-    return node_ids;
+
+    // Extract the ElementIDs from the GIDs
+    if (!result.node_ranges.empty()) {
+        const auto min = result.min_max_range.first;
+        const auto max = result.min_max_range.second;
+
+        std::vector<ElementID> element_ids;
+        auto dataset_elem_ids = pop_group_.getGroup("mapping").getDataSet("element_ids");
+        dataset_elem_ids.select({min}, {max - min}).read(element_ids);
+
+        result.ids.reserve(element_ids_count);
+
+        for (size_t i = 0; i < concrete_node_ids.size(); ++i) {
+            const auto node_id = concrete_node_ids[i];
+            const auto& range = result.node_ranges[i];
+
+            for (auto i = (range.first - min); i < (range.second - min); i++) {
+                result.ids.emplace_back(make_key<T>(node_id, element_ids[i]));
+            }
+        }
+
+        // Temp. fix: When you ask for a large hyperslab in a dataset and then move
+        //            to another dataset in the same file where you also ask for
+        //            another large range, the next IOps take an extra few seconds.
+        //            We observed that fooling HDF5 hides the issue, but we should
+        //            verify this behaviour once new releases of HDF5 are available.
+        dataset_elem_ids.select({min}, {1}).read(element_ids);
+    }
+
+    return result;
 }
 
 template <typename T>
@@ -341,38 +399,15 @@ std::pair<size_t, size_t> ReportReader<T>::Population::getIndex(
 
 template <typename T>
 typename DataFrame<T>::DataType ReportReader<T>::Population::getNodeIdElementIdMapping(
-    const nonstd::optional<Selection>& selection, std::function<void(const Range&)> fn) const {
-    typename DataFrame<T>::DataType ids{};
-
-    Selection::Values node_ids = node_ids_from_selection(selection);
-
-    auto dataset_elem_ids = pop_group_.getGroup("mapping").getDataSet("element_ids");
-    for (const auto& node_id : node_ids) {
-        const auto it = nodes_pointers_.find(node_id);
-        if (it == nodes_pointers_.end()) {
-            continue;
-        }
-
-        std::vector<ElementID> element_ids(it->second.second - it->second.first);
-        dataset_elem_ids.select({it->second.first}, {it->second.second - it->second.first})
-            .read(element_ids.data());
-        for (const auto& elem : element_ids) {
-            ids.push_back(make_key<T>(node_id, elem));
-        }
-
-        if (fn) {
-            fn(it->second);
-        }
-    }
-    return ids;
+    const nonstd::optional<Selection>& node_ids) const {
+    return getNodeIdElementLayout(node_ids).ids;
 }
 
 template <typename T>
-DataFrame<T> ReportReader<T>::Population::get(const nonstd::optional<Selection>& selection,
+DataFrame<T> ReportReader<T>::Population::get(const nonstd::optional<Selection>& node_ids,
                                               const nonstd::optional<double>& tstart,
                                               const nonstd::optional<double>& tstop,
                                               const nonstd::optional<size_t>& tstride) const {
-    DataFrame<T> data_frame;
     size_t index_start = 0;
     size_t index_stop = 0;
     std::tie(index_start, index_stop) = getIndex(tstart, tstop);
@@ -384,24 +419,26 @@ DataFrame<T> ReportReader<T>::Population::get(const nonstd::optional<Selection>&
         throw SonataError("tstart should be <= to tstop");
     }
 
-    for (size_t i = index_start; i <= index_stop; i += stride) {
-        data_frame.times.push_back(times_index_[i].second);
-    }
-
     // min and max offsets of the node_ids requested are calculated
     // to reduce the amount of IO that is brought to memory
-    Ranges positions;
-    uint64_t min = std::numeric_limits<uint64_t>::max();
-    uint64_t max = std::numeric_limits<uint64_t>::min();
-    data_frame.ids = getNodeIdElementIdMapping(selection, [&](const Range& range) {
-        min = std::min(range.first, min);
-        max = std::max(range.second, max);
-        positions.emplace_back(range.first, range.second);
-    });
-    if (data_frame.ids.empty()) {  // At the end no data available (wrong node_ids?)
+    const auto node_id_element_layout = getNodeIdElementLayout(node_ids);
+    const auto& node_ranges = node_id_element_layout.node_ranges;
+    const auto min = node_id_element_layout.min_max_range.first;
+    const auto max = node_id_element_layout.min_max_range.second;
+
+    if (node_id_element_layout.ids.empty()) {  // At the end no data available (wrong node_ids?)
         return DataFrame<T>{{}, {}, {}};
     }
 
+    // Fill times
+    DataFrame<T> data_frame;
+    for (size_t i = index_start; i <= index_stop; i += stride) {
+        data_frame.times.push_back(times_index_[i].second);
+    }
+
+    // Fill ids
+    data_frame.ids = node_id_element_layout.ids;
+
     // Fill .data member
     size_t n_time_entries = ((index_stop - index_start) / stride) + 1;
     size_t n_ids = data_frame.ids.size();
@@ -414,6 +451,7 @@ DataFrame<T> ReportReader<T>::Population::get(const nonstd::optional<Selection>&
             fmt::format("DataType of dataset 'data' should be Float32 ('{}' was found)",
                         dataset_type.string()));
     }
+
     std::vector<float> buffer(max - min);
     for (size_t timer_index = index_start; timer_index <= index_stop; timer_index += stride) {
         // Note: The code assumes that the file is chunked by rows and not by columns
@@ -423,20 +461,21 @@ DataFrame<T> ReportReader<T>::Population::get(const nonstd::optional<Selection>&
         off_t offset = 0;
         off_t data_offset = (timer_index - index_start) / stride;
         auto data_ptr = &data_frame.data[data_offset * n_ids];
-        for (const auto& position : positions) {
-            uint64_t elements_per_gid = position.second - position.first;
-            uint64_t gid_start = position.first - min;
+        for (const auto& range : node_ranges) {
+            uint64_t elements_per_gid = range.second - range.first;
+            uint64_t gid_start = range.first - min;
 
             // Soma report
             if (elements_per_gid == 1) {
                 data_ptr[offset] = buffer[gid_start];
             } else {  // Elements report
-                uint64_t gid_end = position.second - min;
+                uint64_t gid_end = range.second - min;
                 std::copy(&buffer[gid_start], &buffer[gid_end], &data_ptr[offset]);
             }
             offset += elements_per_gid;
         }
     }
+
     return data_frame;
 }