Random sampler analysis class #273

Open · wants to merge 7 commits into base: dev · Changes from all commits
126 changes: 94 additions & 32 deletions easyvvuq/analysis/qmc_analysis.py
@@ -5,6 +5,8 @@
"""
import logging
import numpy as np
import pandas as pd
from scipy.stats import norm
from easyvvuq import OutputType
from .base import BaseAnalysisElement
from easyvvuq.sampling import QMCSampler
@@ -48,7 +50,7 @@ def element_version(self):
"""Version of this element"""
return "0.2"

def analyse(self, data_frame, **kwargs):
"""Perform QMC analysis on a given pandas DataFrame.

Parameters
@@ -63,10 +65,20 @@ def analyse(self, data_frame):
['statistical_moments', 'sobols_first', 'conf_sobols_first',
'sobols_total', 'conf_sobols_total']
"""
if isinstance(data_frame, pd.DataFrame) and data_frame.empty:
raise RuntimeError(
"No data in data frame passed to analyse element")

# extract output values for each quantity of interest from the data frame
samples = self.get_samples(data_frame, **kwargs)

return self.compute_results(samples)

def compute_results(self, samples):
"""

"""

qoi_cols = self.qoi_cols

results = {
@@ -77,9 +89,6 @@ def analyse(self, data_frame):
'conf_sobols_total': {k: {} for k in qoi_cols}
}


# Compute descriptive statistics for each quantity of interest
for k in qoi_cols:
results['statistical_moments'][k] = {'mean': np.mean(samples[k], axis=0),
@@ -94,13 +103,13 @@ def analyse(self, data_frame):

return results

def get_samples(self, data_frame, **kwargs):
"""
Converts the EasyVVUQ data frame (pandas DataFrame or dictionary) into
a dictionary of samples per quantity of interest.

Parameters
----------
data_frame : the EasyVVUQ data frame (pandas DataFrame or dictionary).

Returns
-------
@@ -109,13 +118,54 @@ def get_samples(self, data_frame):

"""
samples = {k: [] for k in self.qoi_cols}
if isinstance(data_frame, pd.DataFrame):
    for run_id in data_frame.run_id.unique():
        for k in self.qoi_cols:
            values = data_frame.loc[data_frame['run_id'] == run_id][k].values
            if 'output_index' in kwargs:
                # select a single point of a vector-valued QoI
                values = values[kwargs['output_index']]
            samples[k].append(values)
elif isinstance(data_frame, dict):
    # a dict is not sorted: make sure the runs are processed in ascending order
    for k in self.qoi_cols:
        run_id_int = [int(run_id.split('Run_')[-1]) for run_id in data_frame[k].keys()]
        for run_id in range(1, np.max(run_id_int) + 1):
            values = data_frame[k]['Run_' + str(run_id)]
            if 'output_index' in kwargs:
                values = values[kwargs['output_index']]
            samples[k].append(values)
return samples
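
# A hedged usage sketch of the new output_index keyword (the sampler and
# QoI names below are illustrative): it selects a single point of a
# vector-valued QoI, so only that output location enters the analysis.
#
#   analysis = QMCAnalysis(sampler=my_sampler, qoi_cols=['velocity'])
#   results = analysis.analyse(data_frame, output_index=0)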

def merge_campaigns(self, data_frames):
"""
Merge the dataframes of multiple (Q)MC campaigns, and then compute the results.
Only use this if the same vary dict was used in the campaigns.

Parameters
----------
data_frames : list of EasyVVUQ data frames

Returns
-------
results : the results dictionary computed from the merged sample sets

"""

assert isinstance(data_frames, list)

samples = {k: [] for k in self.qoi_cols}
# loop over all data frames
for data_frame in data_frames:
    # convert each data frame into a dict
    sample_dict = self.get_samples(data_frame)
    # for each QoI in the dict, merge the lists of samples
    for k in self.qoi_cols:
        samples[k] += sample_dict[k]
# compute the results using the merged sample set
return self.compute_results(samples)
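
# A minimal sketch of merging two campaigns (both must have used the same
# vary dict, as noted above; object names are illustrative):
#
#   frame1 = campaign1.get_collation_result()
#   frame2 = campaign2.get_collation_result()
#   results = analysis.merge_campaigns([frame1, frame2])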

def sobol_bootstrap(self, samples, alpha=0.05, n_bootstrap=1000):
"""
Computes the first order and total order Sobol indices using Saltelli's
method. To assess the sampling inaccuracy, bootstrap confidence intervals
@@ -143,43 +193,53 @@ def sobol_bootstrap(self, samples, alpha=0.05, n_samples=1000):
assert len(samples) > 0
assert alpha > 0.0
assert alpha < 1.0
assert n_bootstrap > 0

# convert to array
samples = np.array(samples)
# the number of parameters, the number of MC samples
# (samples has length n_mc * (n_params + 2)) and the size of the QoI
n_params = self.sampler.n_params
n_mc = int(samples.shape[0] / (n_params + 2))
n_qoi = samples[0].size
sobols_first_dict = {}
conf_first_dict = {}
sobols_total_dict = {}
conf_total_dict = {}

# code evaluations of input matrices M1, M2 and Ni, i = 1,...,n_params
# see reference above.
f_M2, f_M1, f_Ni = self._separate_output_values(samples, n_params, n_mc)
r = np.random.randint(n_mc, size=(n_mc, n_bootstrap))
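# column i of r holds the i-th bootstrap resample of the n_mc row indices,
# so that e.g. f_M2[r] stacks all n_bootstrap resampled sample sets at once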

for j, param_name in enumerate(self.sampler.vary.get_keys()):

# our point estimate for the 1st and total order Sobol indices
value_first = self._first_order(f_M2, f_M1, f_Ni[:, j])
value_total = self._total_order(f_M2, f_M1, f_Ni[:, j])

# sobols computed from resampled data points
if n_mc * n_bootstrap * n_qoi <= 10**7:
    # vectorized computation: fast, but f_M2[r] will be of size
    # (n_mc, n_bootstrap, n_qoi), which can become too large and cause a
    # crash, especially when dealing with large QoIs (n_qoi >> 1). So this
    # is only done when n_mc * n_bootstrap * n_qoi <= 10**7
    print("Vectorized bootstrapping")
    sobols_first = self._first_order(f_M2[r], f_M1[r], f_Ni[r, j])
    sobols_total = self._total_order(f_M2[r], f_M1[r], f_Ni[r, j])
else:
    # non-vectorized implementation: arrays for the resampled estimates
    sobols_first = np.zeros([n_bootstrap, n_qoi])
    sobols_total = np.zeros([n_bootstrap, n_qoi])
    print("Sequential bootstrapping")
    for i in range(n_bootstrap):
        # column i of r holds one resample of size n_mc of the sample
        # matrices, matching the vectorized branch above
        sobols_first[i] = self._first_order(f_M2[r[:, i]], f_M1[r[:, i]], f_Ni[r[:, i], j])
        sobols_total[i] = self._total_order(f_M2[r[:, i]], f_M1[r[:, i]], f_Ni[r[:, i], j])

# compute confidence intervals based on percentiles
_, low_first, high_first = confidence_interval(sobols_first, value_first,
alpha, pivotal=True)
_, low_total, high_total = confidence_interval(sobols_total, value_total,
Expand Down Expand Up @@ -235,6 +295,7 @@ def _separate_output_values(samples, n_params, n_mc_samples):

return f_M2, f_M1, f_Ni

# Adapted from SALib
@staticmethod
def _first_order(f_M2, f_M1, f_Ni):
"""Calculate first order sensitivity indices.
@@ -255,6 +316,7 @@ def _first_order(f_M2, f_M1, f_Ni):
V = np.var(np.r_[f_M2, f_M1], axis=0)
return np.mean(f_M1 * (f_Ni - f_M2), axis=0) / (V + (V == 0)) * (V != 0)
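
# Minimal self-check sketch (illustrative, not part of the diff): for
# f(x1, x2) = x1 + 2*x2 with independent uniform inputs the exact indices
# are S_1 = 1/5 and S_2 = 4/5, and the estimator above reproduces them:
#
#   rng = np.random.default_rng(42)
#   M2, M1 = rng.random((100000, 2)), rng.random((100000, 2))
#   N2 = np.copy(M2)
#   N2[:, 1] = M1[:, 1]                     # M2 with column 1 from M1
#   f = lambda x: x[:, 0] + 2.0 * x[:, 1]
#   QMCAnalysis._first_order(f(M2), f(M1), f(N2))   # approx 0.8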

# Adapted from SALib
@staticmethod
def _total_order(f_M2, f_M1, f_Ni):
"""Calculate total order sensitivity indices. See also:
1 change: 1 addition & 0 deletions easyvvuq/collate/__init__.py
@@ -1,3 +1,4 @@
from .base import BaseCollationElement
from .aggregate_samples import AggregateSamples
from .aggregate_by_variables import AggregateByVariables
from .aggregate_hdf5 import AggregateHDF5
157 changes: 157 additions & 0 deletions easyvvuq/collate/aggregate_hdf5.py
@@ -0,0 +1,157 @@
"""Provides an element for aggregation of results from all complete runs.
"""

import os
from .base import BaseCollationElement
from easyvvuq import OutputType, constants
import h5py

__copyright__ = """

Copyright 2018 Robin A. Richardson, David W. Wright

This file is part of EasyVVUQ

EasyVVUQ is free software: you can redistribute it and/or modify
it under the terms of the Lesser GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

EasyVVUQ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Lesser GNU General Public License for more details.

You should have received a copy of the Lesser GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

"""
__license__ = "LGPL"


class AggregateHDF5(BaseCollationElement, collater_name="aggregate_hdf5"):
"""
Aggregate the results of all completed simulations described by the
Campaign. Simply concatenates all simulation data into a single HDF5
file, which is stored in the campaign directory.

"""

def collate(self, campaign, app_id):
"""
Collect the decoded run results for all completed runs with ENCODED status.

Parameters
----------
campaign : :obj:`easyvvuq.campaign.Campaign`
EasyVVUQ coordination object from which to get information on runs
to be collated.

Returns
-------
`int`:
The number of new data rows added during collation
"""

print('Collating data...')
decoder = campaign._active_app_decoder

if decoder.output_type != OutputType.SAMPLE:
raise RuntimeError('Can only aggregate sample type data')

# Aggregate any uncollated runs into a dataframe (for appending to existing full df)
new_data = {}
qoi_cols = campaign._active_app_decoder.output_columns

# Loop through all runs with status ENCODED (and therefore not yet COLLATED)
processed_run_IDs = []
for run_id, run_info in campaign.campaign_db.runs(
        status=constants.Status.ENCODED, app_id=app_id):

    # use the decoder to check if the run has completed
    # (in general this is application-specific)
    if decoder.sim_complete(run_info=run_info):
        run_data = decoder.parse_sim_output(run_info=run_info)
        new_data[run_id] = {}
        for qoi in qoi_cols:
            new_data[run_id][qoi] = run_data[qoi].values
        processed_run_IDs.append(run_id)

self.append_data(campaign, new_data)
campaign.campaign_db.set_run_statuses(processed_run_IDs, constants.Status.COLLATED)
print('done.')

return len(processed_run_IDs)

def append_data(self, campaign, new_data):
"""

Parameters
----------
campaign : EasyVVUQ campaign object
new_data : a dictionary containing the new simulation results for
each quantity of interest. The results are appended to the HDF5 file
samples.hdf5, located in the Campaign work directory

Returns
-------
None.

"""
hdf5_file = os.path.join(campaign.campaign_dir, 'samples.hdf5')
# retrieve the names of the quantities of interest
qoi_cols = campaign._active_app_decoder.output_columns

h5f = h5py.File(hdf5_file, 'a')
for qoi in qoi_cols:
    # each QoI is stored in a separate group
    if qoi not in h5f.keys():
        h5f.create_group(qoi)
    # each QoI sample is stored in a dataset qoi/run_id
    for run_id in new_data.keys():
        h5f.create_dataset(qoi + '/' + run_id, data=new_data[run_id][qoi])
h5f.close()
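
# Resulting file layout sketch: one HDF5 group per QoI, one dataset per
# run (QoI names illustrative):
#
#   samples.hdf5
#   └── velocity
#       ├── Run_1   <- array of 'velocity' samples from Run_1
#       └── Run_2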

def get_collated_dataframe(self, campaign, app_id):
"""
Read the samples from the HDF5 file samples.hdf5, and store them in
a dictionary to be used in the analysis classes.

Parameters
----------
campaign : EasyVVUQ Campaign object
app_id : the EasyVVUQ app id.

Returns
-------
data : the simulation samples stored in a dictionary. Format of the
dict is data[qoi][run_id], e.g. data['velocity']['Run_1']

"""

hdf5_file = os.path.join(campaign.campaign_dir, 'samples.hdf5')
qoi_cols = campaign._active_app_decoder.output_columns
h5f = h5py.File(hdf5_file, 'r')
data = {}
for qoi in qoi_cols:
    data[qoi] = {}
    for run_id in h5f[qoi].keys():
        data[qoi][run_id] = h5f[qoi + '/' + run_id][()]
h5f.close()
return data
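
# A hedged end-to-end sketch (object names illustrative): the dictionary
# returned here can be fed directly to QMCAnalysis.analyse, which accepts
# dict input via the new branch in get_samples:
#
#   data = collater.get_collated_dataframe(campaign, app_id)
#   results = analysis.analyse(data)   # data['velocity']['Run_1'], ...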

def element_version(self):
return "0.1"

def is_restartable(self):
return True

def get_restart_dict(self):
"""Return dict required for restart from serlialized form.

Returns
-------
dict:
No restart parameters required
"""
return {}
2 changes: 1 addition & 1 deletion easyvvuq/sampling/mc_sampler.py
@@ -113,7 +113,7 @@ def saltelli(self, n_mc):
self.xi_mc[(step - 1):self.max_num:step] = M_1
# store N_i entries between M2 and M1
for i in range(self.n_params):
N_i = np.copy(M_2)
# N_i = M2 with i-th column from M1
N_i[:, i] = M_1[:, i]
self.xi_mc[(i + 1):self.max_num:step] = N_i
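# resulting ordering in xi_mc per block of step = n_params + 2 rows, for
# n_params = 2 (M_2 itself is stored at the block start in the code elided
# above): [M_2, N_1, N_2, M_1, M_2, N_1, ...]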