Random sampler analysis class #273

Open · wants to merge 7 commits into base: dev · Changes from all commits
126 changes: 94 additions & 32 deletions easyvvuq/analysis/qmc_analysis.py
@@ -5,6 +5,8 @@
"""
import logging
import numpy as np
import pandas as pd
from scipy.stats import norm
from easyvvuq import OutputType
from .base import BaseAnalysisElement
from easyvvuq.sampling import QMCSampler
@@ -48,7 +50,7 @@ def element_version(self):
"""Version of this element"""
return "0.2"

def analyse(self, data_frame, **kwargs):
"""Perform QMC analysis on a given pandas DataFrame.

Parameters
@@ -63,10 +65,20 @@ def analyse(self, data_frame):
['statistical_moments', 'sobols_first', 'conf_sobols_first',
'sobols_total', 'conf_sobols_total']
"""
if isinstance(data_frame, pd.DataFrame) and data_frame.empty:
raise RuntimeError(
"No data in data frame passed to analyse element")

# extract output values for each quantity of interest from the data frame
samples = self.get_samples(data_frame, **kwargs)

return self.compute_results(samples)

def compute_results(self, samples):
"""

"""

qoi_cols = self.qoi_cols

results = {
@@ -77,9 +89,6 @@ def analyse(self, data_frame):
'conf_sobols_total': {k: {} for k in qoi_cols}
}


# Compute descriptive statistics for each quantity of interest
for k in qoi_cols:
results['statistical_moments'][k] = {'mean': np.mean(samples[k], axis=0),
@@ -94,13 +103,13 @@ def analyse(self, data_frame):

return results

def get_samples(self, data_frame, **kwargs):
"""
Converts the EasyVVUQ data frame (pandas DataFrame or dictionary) into
a dictionary of samples per quantity of interest.

Parameters
----------
data_frame : the EasyVVUQ data frame (pandas DataFrame or dictionary).

Returns
-------
@@ -109,13 +118,54 @@ def get_samples(self, data_frame):

"""
samples = {k: [] for k in self.qoi_cols}
if isinstance(data_frame, pd.DataFrame):
    for run_id in data_frame.run_id.unique():
        for k in self.qoi_cols:
            values = data_frame.loc[data_frame['run_id'] == run_id][k].values
            if 'output_index' in kwargs:
                # select a single point of a vector-valued QoI
                values = values[kwargs['output_index']]
            samples[k].append(values)
elif isinstance(data_frame, dict):
    # a dict is not sorted: make sure the runs are processed in ascending order
    for k in self.qoi_cols:
        run_id_int = [int(run_id.split('Run_')[-1]) for run_id in data_frame[k].keys()]
        for run_id in range(1, np.max(run_id_int) + 1):
            values = data_frame[k]['Run_' + str(run_id)]
            if 'output_index' in kwargs:
                values = values[kwargs['output_index']]
            samples[k].append(values)
return samples
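
# A hedged usage sketch of the new output_index keyword (the sampler and
# QoI names below are illustrative): it selects a single point of a
# vector-valued QoI, so only that output location enters the analysis.
#
#   analysis = QMCAnalysis(sampler=my_sampler, qoi_cols=['velocity'])
#   results = analysis.analyse(data_frame, output_index=0)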

def merge_campaigns(self, data_frames):
"""
Merge the dataframes of multiple (Q)MC campaigns, and then compute the results.
Only use this if the same vary dict was used in the campaigns.

Parameters
----------
data_frames : list of EasyVVUQ data frames

Returns
-------
results : the results dictionary computed from the merged sample sets

"""

assert isinstance(data_frames, list)

samples = {k: [] for k in self.qoi_cols}
# loop over all data frames
for data_frame in data_frames:
    # convert each data frame into a dict
    sample_dict = self.get_samples(data_frame)
    # for each QoI in the dict, merge the lists of samples
    for k in self.qoi_cols:
        samples[k] += sample_dict[k]
# compute the results using the merged sample set
return self.compute_results(samples)
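
# A minimal sketch of merging two campaigns (both must have used the same
# vary dict, as noted above; object names are illustrative):
#
#   frame1 = campaign1.get_collation_result()
#   frame2 = campaign2.get_collation_result()
#   results = analysis.merge_campaigns([frame1, frame2])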

def sobol_bootstrap(self, samples, alpha=0.05, n_bootstrap=1000):
"""
Computes the first order and total order Sobol indices using Saltelli's
method. To assess the sampling inaccuracy, bootstrap confidence intervals
@@ -143,43 +193,53 @@ def sobol_bootstrap(self, samples, alpha=0.05, n_samples=1000):
assert len(samples) > 0
assert alpha > 0.0
assert alpha < 1.0
assert n_bootstrap > 0

# convert to array
samples = np.array(samples)
# the number of parameters, the number of MC samples
# (samples has length n_mc * (n_params + 2)) and the size of the QoI
n_params = self.sampler.n_params
n_mc = int(samples.shape[0] / (n_params + 2))
n_qoi = samples[0].size
sobols_first_dict = {}
conf_first_dict = {}
sobols_total_dict = {}
conf_total_dict = {}

# code evaluations of input matrices M1, M2 and Ni, i = 1,...,n_params
# see reference above.
f_M2, f_M1, f_Ni = self._separate_output_values(samples, n_params, n_mc)
r = np.random.randint(n_mc, size=(n_mc, n_bootstrap))
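# column i of r holds the i-th bootstrap resample of the n_mc row indices,
# so that e.g. f_M2[r] stacks all n_bootstrap resampled sample sets at once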

for j, param_name in enumerate(self.sampler.vary.get_keys()):

# our point estimate for the 1st and total order Sobol indices
value_first = self._first_order(f_M2, f_M1, f_Ni[:, j])
value_total = self._total_order(f_M2, f_M1, f_Ni[:, j])

# sobols computed from resampled data points
if n_mc * n_bootstrap * n_qoi <= 10**7:
    # vectorized computation: fast, but f_M2[r] will be of size
    # (n_mc, n_bootstrap, n_qoi), which can become too large and cause a
    # crash, especially when dealing with large QoIs (n_qoi >> 1). So this
    # is only done when n_mc * n_bootstrap * n_qoi <= 10**7
    print("Vectorized bootstrapping")
    sobols_first = self._first_order(f_M2[r], f_M1[r], f_Ni[r, j])
    sobols_total = self._total_order(f_M2[r], f_M1[r], f_Ni[r, j])
else:
    # non-vectorized implementation: arrays for the resampled estimates
    sobols_first = np.zeros([n_bootstrap, n_qoi])
    sobols_total = np.zeros([n_bootstrap, n_qoi])
    print("Sequential bootstrapping")
    for i in range(n_bootstrap):
        # column i of r holds one resample of size n_mc of the sample
        # matrices, matching the vectorized branch above
        sobols_first[i] = self._first_order(f_M2[r[:, i]], f_M1[r[:, i]], f_Ni[r[:, i], j])
        sobols_total[i] = self._total_order(f_M2[r[:, i]], f_M1[r[:, i]], f_Ni[r[:, i], j])

# compute confidence intervals based on percentiles
_, low_first, high_first = confidence_interval(sobols_first, value_first,
alpha, pivotal=True)
_, low_total, high_total = confidence_interval(sobols_total, value_total,
Expand Down Expand Up @@ -235,6 +295,7 @@ def _separate_output_values(samples, n_params, n_mc_samples):

return f_M2, f_M1, f_Ni

# Adapted from SALib
@staticmethod
def _first_order(f_M2, f_M1, f_Ni):
"""Calculate first order sensitivity indices.
@@ -255,6 +316,7 @@ def _first_order(f_M2, f_M1, f_Ni):
V = np.var(np.r_[f_M2, f_M1], axis=0)
return np.mean(f_M1 * (f_Ni - f_M2), axis=0) / (V + (V == 0)) * (V != 0)
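
# Minimal self-check sketch (illustrative, not part of the diff): for
# f(x1, x2) = x1 + 2*x2 with independent uniform inputs the exact indices
# are S_1 = 1/5 and S_2 = 4/5, and the estimator above reproduces them:
#
#   rng = np.random.default_rng(42)
#   M2, M1 = rng.random((100000, 2)), rng.random((100000, 2))
#   N2 = np.copy(M2)
#   N2[:, 1] = M1[:, 1]                     # M2 with column 1 from M1
#   f = lambda x: x[:, 0] + 2.0 * x[:, 1]
#   QMCAnalysis._first_order(f(M2), f(M1), f(N2))   # approx 0.8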

# Adapted from SALib
@staticmethod
def _total_order(f_M2, f_M1, f_Ni):
"""Calculate total order sensitivity indices. See also:
1 change: 1 addition & 0 deletions easyvvuq/collate/__init__.py
@@ -1,3 +1,4 @@
from .base import BaseCollationElement
from .aggregate_samples import AggregateSamples
from .aggregate_by_variables import AggregateByVariables
from .aggregate_hdf5 import AggregateHDF5
157 changes: 157 additions & 0 deletions easyvvuq/collate/aggregate_hdf5.py
@@ -0,0 +1,157 @@
"""Provides an element for aggregation of results from all complete runs.
"""

import os
from .base import BaseCollationElement
from easyvvuq import OutputType, constants
import h5py

__copyright__ = """

Copyright 2018 Robin A. Richardson, David W. Wright

This file is part of EasyVVUQ

EasyVVUQ is free software: you can redistribute it and/or modify
it under the terms of the Lesser GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

EasyVVUQ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Lesser GNU General Public License for more details.

You should have received a copy of the Lesser GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.

"""
__license__ = "LGPL"


class AggregateHDF5(BaseCollationElement, collater_name="aggregate_hdf5"):
"""
Aggregate the results of all completed simulations described by the
Campaign. Simply concatenates all simulation data into a single HDF5
file, which is stored in the campaign directory.

"""

def collate(self, campaign, app_id):
"""
Collect the decoded run results for all completed runs with ENCODED status.

Parameters
----------
campaign : :obj:`easyvvuq.campaign.Campaign`
EasyVVUQ coordination object from which to get information on runs
to be collated.

Returns
-------
`int`:
The number of new data rows added during collation
"""

print('Collating data...')
decoder = campaign._active_app_decoder

if decoder.output_type != OutputType.SAMPLE:
raise RuntimeError('Can only aggregate sample type data')

# Aggregate any uncollated runs into a dataframe (for appending to existing full df)
new_data = {}
qoi_cols = campaign._active_app_decoder.output_columns

# Loop through all runs with status ENCODED (and therefore not yet COLLATED)
processed_run_IDs = []
for run_id, run_info in campaign.campaign_db.runs(
        status=constants.Status.ENCODED, app_id=app_id):

    # use the decoder to check if the run has completed
    # (in general this is application-specific)
    if decoder.sim_complete(run_info=run_info):
        run_data = decoder.parse_sim_output(run_info=run_info)
        new_data[run_id] = {}
        for qoi in qoi_cols:
            new_data[run_id][qoi] = run_data[qoi].values
        processed_run_IDs.append(run_id)

self.append_data(campaign, new_data)
campaign.campaign_db.set_run_statuses(processed_run_IDs, constants.Status.COLLATED)
print('done.')

return len(processed_run_IDs)

def append_data(self, campaign, new_data):
"""

Parameters
----------
campaign : EasyVVUQ campaign object
new_data : a dictionary containing the new simulation results for
each quantity of interest. The results are appended to the HDF5 file
samples.hdf5, located in the Campaign work directory

Returns
-------
None.

"""
hdf5_file = os.path.join(campaign.campaign_dir, 'samples.hdf5')
# retrieve the names of the quantities of interest
qoi_cols = campaign._active_app_decoder.output_columns

h5f = h5py.File(hdf5_file, 'a')
for qoi in qoi_cols:
    # each QoI is stored in a separate group
    if qoi not in h5f.keys():
        h5f.create_group(qoi)
    # each QoI sample is stored in a dataset qoi/run_id
    for run_id in new_data.keys():
        h5f.create_dataset(qoi + '/' + run_id, data=new_data[run_id][qoi])
h5f.close()
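
# Resulting file layout sketch: one HDF5 group per QoI, one dataset per
# run (QoI names illustrative):
#
#   samples.hdf5
#   └── velocity
#       ├── Run_1   <- array of 'velocity' samples from Run_1
#       └── Run_2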

def get_collated_dataframe(self, campaign, app_id):
"""
Read the samples from the HDF5 file samples.hdf5, and store them in
a dictionary to be used in the analysis classes.

Parameters
----------
campaign : EasyVVUQ Campaign object
app_id : the EasyVVUQ app id.

Returns
-------
data : the simulation samples stored in a dictionary. Format of the
dict is data[qoi][run_id], e.g. data['velocity']['Run_1']

"""

hdf5_file = os.path.join(campaign.campaign_dir, 'samples.hdf5')
qoi_cols = campaign._active_app_decoder.output_columns
h5f = h5py.File(hdf5_file, 'r')
data = {}
for qoi in qoi_cols:
    data[qoi] = {}
    for run_id in h5f[qoi].keys():
        data[qoi][run_id] = h5f[qoi + '/' + run_id][()]
h5f.close()
return data
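
# A hedged end-to-end sketch (object names illustrative): the dictionary
# returned here can be fed directly to QMCAnalysis.analyse, which accepts
# dict input via the new branch in get_samples:
#
#   data = collater.get_collated_dataframe(campaign, app_id)
#   results = analysis.analyse(data)   # data['velocity']['Run_1'], ...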

def element_version(self):
return "0.1"

def is_restartable(self):
return True

def get_restart_dict(self):
"""Return dict required for restart from serlialized form.

Returns
-------
dict:
No restart parameters required
"""
return {}
2 changes: 1 addition & 1 deletion easyvvuq/sampling/mc_sampler.py
@@ -113,7 +113,7 @@ def saltelli(self, n_mc):
self.xi_mc[(step - 1):self.max_num:step] = M_1
# store N_i entries between M2 and M1
for i in range(self.n_params):
N_i = np.copy(M_2)
# N_i = M2 with i-th column from M1
N_i[:, i] = M_1[:, i]
self.xi_mc[(i + 1):self.max_num:step] = N_i
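# resulting ordering in xi_mc per block of step = n_params + 2 rows, for
# n_params = 2 (M_2 itself is stored at the block start in the code elided
# above): [M_2, N_1, N_2, M_1, M_2, N_1, ...]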