diff --git a/easyvvuq/analysis/qmc_analysis.py b/easyvvuq/analysis/qmc_analysis.py
index 8edb3214a..8db62f2e3 100644
--- a/easyvvuq/analysis/qmc_analysis.py
+++ b/easyvvuq/analysis/qmc_analysis.py
@@ -5,6 +5,8 @@
 """
 import logging
 import numpy as np
+import pandas as pd
+from scipy.stats import norm
 from easyvvuq import OutputType
 from .base import BaseAnalysisElement
 from easyvvuq.sampling import QMCSampler
@@ -48,7 +50,7 @@ def element_version(self):
         """Version of this element"""
         return "0.2"
 
-    def analyse(self, data_frame):
+    def analyse(self, data_frame, **kwargs):
         """Perform QMC analysis on a given pandas DataFrame.
 
         Parameters
@@ -63,10 +65,20 @@ def analyse(self, data_frame):
             ['statistical_moments', 'percentiles', 'sobol_indices',
              'correlation_matrices', 'output_distributions']
         """
-        if data_frame.empty:
+        if isinstance(data_frame, pd.DataFrame) and data_frame.empty:
             raise RuntimeError(
                 "No data in data frame passed to analyse element")
 
+        # extract the output values for each quantity of interest
+        samples = self.get_samples(data_frame, **kwargs)
+
+        return self.compute_results(samples)
+
+    def compute_results(self, samples):
+        """
+        Compute statistical moments, percentiles and bootstrapped Sobol indices from a dict of samples.
+        """
+
         qoi_cols = self.qoi_cols
 
         results = {
@@ -77,9 +89,6 @@ def analyse(self, data_frame):
             'conf_sobols_total': {k: {} for k in qoi_cols}
         }
 
-        # Extract output values for each quantity of interest from Dataframe
-        samples = self.get_samples(data_frame)
-
         # Compute descriptive statistics for each quantity of interest
         for k in qoi_cols:
             results['statistical_moments'][k] = {'mean': np.mean(samples[k], axis=0),
@@ -94,13 +103,13 @@ def analyse(self, data_frame):
 
         return results
 
-    def get_samples(self, data_frame):
+    def get_samples(self, data_frame, **kwargs):
         """
         Converts the Pandas dataframe into a dictionary.
 
         Parameters
         ----------
-        data_frame : the EasyVVUQ Pandas dataframe.
+        data_frame : the EasyVVUQ pandas DataFrame or a dictionary of samples.
 
         Returns
         -------
@@ -109,13 +118,54 @@
         """
         samples = {k: [] for k in self.qoi_cols}
-        for run_id in data_frame.run_id.unique():
+        if isinstance(data_frame, pd.DataFrame):
+            for run_id in data_frame.run_id.unique():
+                for k in self.qoi_cols:
+                    if 'output_index' in kwargs:
+                        values = data_frame.loc[data_frame['run_id'] == run_id][k].values[kwargs['output_index']]
+                    else:
+                        values = data_frame.loc[data_frame['run_id'] == run_id][k].values
+                    samples[k].append(values)
+        elif isinstance(data_frame, dict):
+            # a dict is not sorted: make sure the runs are processed in ascending order
             for k in self.qoi_cols:
-                data = data_frame.loc[data_frame['run_id'] == run_id][k]
-                samples[k].append(data.values)
+                run_id_int = [int(run_id.split('Run_')[-1]) for run_id in data_frame[k].keys()]
+                for run_id in range(1, np.max(run_id_int) + 1):
+                    if 'output_index' in kwargs:
+                        samples[k].append(data_frame[k]['Run_' + str(run_id)][kwargs['output_index']])
+                    else:
+                        samples[k].append(data_frame[k]['Run_' + str(run_id)])
         return samples
+
+    def merge_campaigns(self, data_frames):
+        """
+        Merge the data frames of multiple (Q)MC campaigns, then compute the
+        results. Only use this if the same vary dict was used in all campaigns.
+
+        Parameters
+        ----------
+        data_frames : list of EasyVVUQ data frames
 
-    def sobol_bootstrap(self, samples, alpha=0.05, n_samples=1000):
+        Returns
+        -------
+        results : the results dictionary computed from the merged sample sets
+
+        """
+
+        assert isinstance(data_frames, list)
+
+        samples = {k: [] for k in self.qoi_cols}
+        # loop over all data frames
+        for data_frame in data_frames:
+            # convert each data frame into a dict of samples
+            sample_dict = self.get_samples(data_frame)
+            # for each QoI, merge the lists of samples
+            for k in self.qoi_cols:
+                samples[k] += sample_dict[k]
+        # compute the results using the merged sample set
+        return self.compute_results(samples)
+
+    def sobol_bootstrap(self, samples, alpha=0.05, n_bootstrap=1000):
         """
         Computes the first order and total order Sobol indices using Saltelli's
         method. To assess the sampling inaccuracy, bootstrap confidence intervals
@@ -143,43 +193,53 @@ def sobol_bootstrap(self, samples, alpha=0.05, n_samples=1000):
         assert len(samples) > 0
         assert alpha > 0.0
         assert alpha < 1.0
-        assert n_samples > 0
+        assert n_bootstrap > 0
 
         # convert to array
         samples = np.array(samples)
         # the number of parameters, the number of MC samples n_mc (the total
         # number of code evaluations is n_mc * (n_params + 2)), and the QoI size
         n_params = self.sampler.n_params
-        n_mc = self.sampler.n_mc_samples
+        # derive n_mc from the sample count, so merged sample sets are handled correctly
+        n_mc = samples.shape[0] // (n_params + 2)
         n_qoi = samples[0].size
         sobols_first_dict = {}
         conf_first_dict = {}
         sobols_total_dict = {}
         conf_total_dict = {}
 
+        # code evaluations of input matrices M1, M2 and Ni, i = 1,...,n_params
+        # see reference above.
+        f_M2, f_M1, f_Ni = self._separate_output_values(samples, n_params, n_mc)
+        r = np.random.randint(n_mc, size=(n_mc, n_bootstrap))
+
         for j, param_name in enumerate(self.sampler.vary.get_keys()):
-            # code evaluations of input matrices M1, M2 and Ni, i = 1,...,n_params
-            # see reference above.
-            f_M2, f_M1, f_Ni = self._separate_output_values(samples, n_params, n_mc)
+
             # our point estimate for the 1st and total order Sobol indices
             value_first = self._first_order(f_M2, f_M1, f_Ni[:, j])
             value_total = self._total_order(f_M2, f_M1, f_Ni[:, j])
-            # array for resampled estimates
-            sobols_first = np.zeros([n_samples, n_qoi])
-            sobols_total = np.zeros([n_samples, n_qoi])
-            for i in range(n_samples):
-                # resample, must be done on already seperated output due to
-                # the specific order in samples
-                idx = np.random.randint(0, n_mc - 1, n_mc)
-                f_M2_resample = f_M2[idx]
-                f_M1_resample = f_M1[idx]
-                f_Ni_resample = f_Ni[idx]
-                # recompute Sobol indices
-                sobols_first[i] = self._first_order(f_M2_resample, f_M1_resample,
-                                                    f_Ni_resample[:, j])
-                sobols_total[i] = self._total_order(f_M2_resample, f_M1_resample,
-                                                    f_Ni_resample[:, j])
-            # compute confidence intervals
+
+            # Sobol indices computed from resampled data points
+            if n_mc * n_bootstrap * n_qoi <= 10**7:
+                # this is a vectorized computation: it is fast, but f_M2[r] will be
+                # of size (n_mc, n_bootstrap, n_qoi), which can become too large and
+                # cause a crash, especially for large QoI sizes (n_qoi >> 1).
+                # This is therefore only done when n_mc * n_bootstrap * n_qoi <= 10**7.
+                print("Vectorized bootstrapping")
+                sobols_first = self._first_order(f_M2[r], f_M1[r], f_Ni[r, j])
+                sobols_total = self._total_order(f_M2[r], f_M1[r], f_Ni[r, j])
+            else:
+                # array for resampled estimates
+                sobols_first = np.zeros([n_bootstrap, n_qoi])
+                sobols_total = np.zeros([n_bootstrap, n_qoi])
+                print("Sequential bootstrapping")
+                # non-vectorized implementation
+                for i in range(n_bootstrap):
+                    # resampled output matrices, each of size (n_mc, n_qoi)
+                    sobols_first[i] = self._first_order(f_M2[r[i]], f_M1[r[i]], f_Ni[r[i], j])
+                    sobols_total[i] = self._total_order(f_M2[r[i]], f_M1[r[i]], f_Ni[r[i], j])
+
+            # compute confidence intervals based on percentiles
             _, low_first, high_first = confidence_interval(sobols_first, value_first,
                                                            alpha, pivotal=True)
             _, low_total, high_total = confidence_interval(sobols_total, value_total,
@@ -235,6 +295,7 @@
 
         return f_M2, f_M1, f_Ni
 
+    # Adapted from SALib
     @staticmethod
     def _first_order(f_M2, f_M1, f_Ni):
         """Calculate first order sensitivity indices.
@@ -255,6 +316,7 @@
         V = np.var(np.r_[f_M2, f_M1], axis=0)
         return np.mean(f_M1 * (f_Ni - f_M2), axis=0) / (V + (V == 0)) * (V != 0)
 
+    # Adapted from SALib
     @staticmethod
     def _total_order(f_M2, f_M1, f_Ni):
         """Calculate total order sensitivity indices. See also:
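Usage sketch (not part of the patch): `merge_campaigns` concatenates the per-QoI sample lists of several campaigns before calling `compute_results`, which is also why `n_mc` is now derived from the sample count rather than read from the sampler. A minimal sketch of the intended call pattern, assuming two campaigns that used the same `vary` dict and a QoI named `'velocity'` (the `sampler`, `campaign1` and `campaign2` objects are placeholders):

```python
import easyvvuq as uq

# analysis element tied to the (Q)MC sampler shared by both campaigns
analysis = uq.analysis.QMCAnalysis(sampler=sampler, qoi_cols=['velocity'])

# collation results of two separately executed campaigns
df1 = campaign1.get_collation_result()
df2 = campaign2.get_collation_result()

# merged point estimates plus bootstrap confidence intervals
results = analysis.merge_campaigns([df1, df2])
print(results['sobols_first']['velocity'])
```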
diff --git a/easyvvuq/collate/__init__.py b/easyvvuq/collate/__init__.py
index 833b81632..42411d31e 100644
--- a/easyvvuq/collate/__init__.py
+++ b/easyvvuq/collate/__init__.py
@@ -1,3 +1,4 @@
 from .base import BaseCollationElement
 from .aggregate_samples import AggregateSamples
 from .aggregate_by_variables import AggregateByVariables
+from .aggregate_hdf5 import AggregateHDF5
diff --git a/easyvvuq/collate/aggregate_hdf5.py b/easyvvuq/collate/aggregate_hdf5.py
new file mode 100644
index 000000000..a3f3dc1b2
--- /dev/null
+++ b/easyvvuq/collate/aggregate_hdf5.py
@@ -0,0 +1,157 @@
+"""Provides an element for aggregation of results from all complete runs.
+"""
+
+import os
+from .base import BaseCollationElement
+from easyvvuq import OutputType, constants
+import h5py
+
+__copyright__ = """
+
+    Copyright 2018 Robin A. Richardson, David W. Wright
+
+    This file is part of EasyVVUQ
+
+    EasyVVUQ is free software: you can redistribute it and/or modify
+    it under the terms of the Lesser GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    EasyVVUQ is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    Lesser GNU General Public License for more details.
+
+    You should have received a copy of the Lesser GNU General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""
+__license__ = "LGPL"
+
+
+class AggregateHDF5(BaseCollationElement, collater_name="aggregate_hdf5"):
+    """
+    Aggregate the results of all completed simulations described by the
+    Campaign. Simply concatenates all simulation data into one HDF5 file,
+    stored in the campaign directory.
+    """
+
+    def collate(self, campaign, app_id):
+        """
+        Collect the decoded run results for all completed runs with ENCODED status.
+
+        Parameters
+        ----------
+        campaign : :obj:`easyvvuq.campaign.Campaign`
+            EasyVVUQ coordination object from which to get information on runs
+            to be collated.
+
+        Returns
+        -------
+        `int`:
+            The number of new runs added during collation
+        """
+
+        print('Collating data...')
+        decoder = campaign._active_app_decoder
+
+        if decoder.output_type != OutputType.SAMPLE:
+            raise RuntimeError('Can only aggregate sample type data')
+
+        # Aggregate any uncollated runs into a dictionary (for appending to the HDF5 store)
+        new_data = {}
+        qoi_cols = campaign._active_app_decoder.output_columns
+
+        # Loop through all runs with status ENCODED (and therefore not yet COLLATED)
+        processed_run_IDs = []
+        for run_id, run_info in campaign.campaign_db.runs(
+                status=constants.Status.ENCODED, app_id=app_id):
+
+            # Use decoder to check if run has completed (in general application-specific)
+            if decoder.sim_complete(run_info=run_info):
+
+                run_data = decoder.parse_sim_output(run_info=run_info)
+                new_data[run_id] = {}
+                for qoi in qoi_cols:
+                    new_data[run_id][qoi] = run_data[qoi].values
+                processed_run_IDs.append(run_id)
+
+        self.append_data(campaign, new_data)
+        campaign.campaign_db.set_run_statuses(processed_run_IDs, constants.Status.COLLATED)
+        print('done.')
+
+        return len(processed_run_IDs)
+
+    def append_data(self, campaign, new_data):
+        """
+        Append new simulation results to the HDF5 file samples.hdf5, located
+        in the Campaign work directory.
+
+        Parameters
+        ----------
+        campaign : EasyVVUQ campaign object
+        new_data : a dictionary containing the new simulation results for
+            each quantity of interest.
+
+        Returns
+        -------
+        None.
+
+        """
+        hdf5_file = os.path.join(campaign.campaign_dir, 'samples.hdf5')
+        # retrieve the names of the quantities of interest
+        qoi_cols = campaign._active_app_decoder.output_columns
+
+        h5f = h5py.File(hdf5_file, 'a')
+        for qoi in qoi_cols:
+            # each QoI is stored in a separate group
+            if qoi not in h5f.keys():
+                h5f.create_group(qoi)
+            # each QoI sample is stored in a dataset qoi/run_id
+            for run_id in new_data.keys():
+                h5f.create_dataset(qoi + '/' + run_id, data=new_data[run_id][qoi])
+        h5f.close()
+
+    def get_collated_dataframe(self, campaign, app_id):
+        """
+        Read the samples from the HDF5 file samples.hdf5, and store them in
+        a dictionary to be used in the analysis classes.
+
+        Parameters
+        ----------
+        campaign : EasyVVUQ Campaign object
+        app_id : the EasyVVUQ app id.
+
+        Returns
+        -------
+        data : the simulation samples stored in a dictionary. Format of the
+            dict is data[qoi][run_id], e.g. data['velocity']['Run_1']
+
+        """
+
+        hdf5_file = os.path.join(campaign.campaign_dir, 'samples.hdf5')
+        qoi_cols = campaign._active_app_decoder.output_columns
+        h5f = h5py.File(hdf5_file, 'r')
+        data = {}
+        for qoi in qoi_cols:
+            data[qoi] = {}
+            for run_id in h5f[qoi].keys():
+                data[qoi][run_id] = h5f[qoi + '/' + run_id][()]
+        h5f.close()
+        return data
+
+    def element_version(self):
+        return "0.1"
+
+    def is_restartable(self):
+        return True
+
+    def get_restart_dict(self):
+        """Return dict required for restart from serialized form.
+
+        Returns
+        -------
+        dict:
+            No restart parameters required
+        """
+        return {}
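Usage sketch (not part of the patch): `append_data` writes one HDF5 group per QoI with one dataset per run, so `samples.hdf5` can also be inspected directly with `h5py`, outside of EasyVVUQ. The file path is assumed here:

```python
import h5py

# samples.hdf5 lives in the campaign work directory
with h5py.File('samples.hdf5', 'r') as h5f:
    for qoi in h5f.keys():                 # one group per quantity of interest
        for run_id in h5f[qoi].keys():     # one dataset per run, e.g. 'Run_1'
            values = h5f[qoi][run_id][()]  # read the dataset as a NumPy array
            print(qoi, run_id, values.shape)
```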
diff --git a/easyvvuq/sampling/mc_sampler.py b/easyvvuq/sampling/mc_sampler.py
index de9ee2f12..a6bfa2929 100644
--- a/easyvvuq/sampling/mc_sampler.py
+++ b/easyvvuq/sampling/mc_sampler.py
@@ -113,7 +113,7 @@ def saltelli(self, n_mc):
         self.xi_mc[(step - 1):self.max_num:step] = M_1
         # store N_i entries between M2 and M1
         for i in range(self.n_params):
-            N_i = np.array(M_2)
+            N_i = np.copy(M_2)
             # N_i = M2 with i-th column from M1
             N_i[:, i] = M_1[:, i]
             self.xi_mc[(i + 1):self.max_num:step] = N_i
diff --git a/tests/test_mc_analysis.py b/tests/test_mc_analysis.py
index 546724e03..7405c965d 100644
--- a/tests/test_mc_analysis.py
+++ b/tests/test_mc_analysis.py
@@ -75,14 +75,14 @@ def test_sobol_bootstrap(data):
     assert(s1['x2'] == pytest.approx(0.20727553481694053, 0.01))
     assert(st['x1'] == pytest.approx(0.8132793654841785, 0.01))
     assert(st['x2'] == pytest.approx(0.3804962894947435, 0.01))
-    assert(s1_conf['x1']['low'][0] == pytest.approx(0.14387035, 0.01))
-    assert(s1_conf['x1']['high'][0] == pytest.approx(0.89428774, 0.01))
-    assert(s1_conf['x2']['low'][0] == pytest.approx(-0.11063341, 0.01))
-    assert(s1_conf['x2']['high'][0] == pytest.approx(0.46752829, 0.01))
-    assert(st_conf['x1']['low'][0] == pytest.approx(0.61368887, 0.01))
-    assert(st_conf['x1']['high'][0] == pytest.approx(1.01858671, 0.01))
-    assert(st_conf['x2']['low'][0] == pytest.approx(0.24361207, 0.01))
-    assert(st_conf['x2']['high'][0] == pytest.approx(0.49214117, 0.01))
+    assert(s1_conf['x1']['low'] == pytest.approx(0.14387035, 0.01))
+    assert(s1_conf['x1']['high'] == pytest.approx(0.89428774, 0.01))
+    assert(s1_conf['x2']['low'] == pytest.approx(-0.11063341, 0.01))
+    assert(s1_conf['x2']['high'] == pytest.approx(0.46752829, 0.01))
+    assert(st_conf['x1']['low'] == pytest.approx(0.61368887, 0.01))
+    assert(st_conf['x1']['high'] == pytest.approx(1.01858671, 0.01))
+    assert(st_conf['x2']['low'] == pytest.approx(0.24361207, 0.01))
+    assert(st_conf['x2']['high'] == pytest.approx(0.49214117, 0.01))
 
 def test_separate_output_values(data):
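Usage sketch (not part of the patch): with these changes, `QMCAnalysis.get_samples` (and hence `analyse`) accepts both the usual pandas DataFrame and the dict produced by `AggregateHDF5.get_collated_dataframe`, while the new `output_index` keyword selects a single entry of a vector-valued QoI. A toy example, assuming an `analysis` object constructed with `qoi_cols=['velocity']`:

```python
# dict format returned by AggregateHDF5.get_collated_dataframe
data = {'velocity': {'Run_1': [0.1, 0.2], 'Run_2': [0.3, 0.4]}}

# keep only the last output entry of each run
samples = analysis.get_samples(data, output_index=-1)
assert samples == {'velocity': [0.2, 0.4]}
```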