Understanding doepipeline on simple example #33

Open
bgriffen opened this issue Jul 23, 2019 · 2 comments
bgriffen commented Jul 23, 2019

Great package. I've just begun tinkering at a very low level to make sure my intuition reflects what the code is doing. I've set up a two-factor experiment with a normally distributed response. For example:

# standard
import numpy as np
import pandas as pd

# plotting
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
from mpl_toolkits.mplot3d import Axes3D

# stats
from scipy.stats import multivariate_normal

# doe
from doepipeline.designer import ExperimentDesigner

# logging
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def generate_response(dfi, rv):
    # scale the multivariate-normal density to use as a synthetic response
    return 1000 * rv.pdf(dfi)

number_of_iterations = 5 # number of iterations to try

mu_x = 60
variance_x = 500

mu_y = 75
variance_y = 500

minx = 40
maxx = 100
miny = 45
maxy = 110

responses = {"fraction":{"criterion":"maximize"}}

factors = {
            "A": {
                    "min":10,
                    "max": 150,
                    "low_init": minx,
                    "high_init": maxx,
                },
            "B": {
                    "min":10,
                    "max":150,
                    "low_init":miny,
                    "high_init":maxy,
                     }
            } 

# create grid and multivariate normal

x = np.linspace(minx,maxx,100)
y = np.linspace(miny,maxy,100)
X, Y = np.meshgrid(x,y)
pos = np.empty(X.shape + (2,))
pos[:, :, 0] = X; pos[:, :, 1] = Y
rv = multivariate_normal([mu_x, mu_y], [[variance_x, 0], [0, variance_y]])

Z = generate_response(pos,rv)

# view response surface for factors A & B

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X,Y,Z)
ax.set_xlabel("A")
ax.set_ylabel("B")
ax.set_zlabel("response")
fig.savefig("response.png",bbox_inches='tight',dpi=300)
plt.show()

[figure: response.png — 3D response surface over factors A and B]

I now want to iteratively optimize, as per doepipeline, and hopefully converge on the optimum set by mu_x and mu_y.

exp = ExperimentDesigner(factors,
                        'fullfactorial2levels',
                        responses,
                        model_selection='greedy',
                        skip_screening=True,
                        shrinkage=0.9)

# create first design (skipping screening)

df = exp.new_design()

dfstart = df.copy()
factor_updates = []  # renamed so the factor dict defined above is not shadowed
optimal = []
designs = []
best = []
designs.append(df)

for niters in range(number_of_iterations):
    r_0 = generate_response(df,rv)
    fractioni = pd.DataFrame.from_dict({"fraction":r_0})
    bi = exp.get_best_experiment(df,fractioni)
    fi = exp.update_factors_from_optimum(bi)
    opti = exp.get_optimal_settings(fractioni)
    df = exp.new_design()
    best.append(bi)
    factor_updates.append(fi)
    designs.append(df)
    print("Iteration",niters+1)

Now I create a simple function to plot each experimental design at each step. I made one or two modifications to designer.py so it returns the model and optima, which I can then inspect interactively in my notebook.

def plot_search(dflist,Zin,optima,expi):
    fig,ax = plt.subplots(figsize=(10,8))
    falpha = np.linspace(0.3,0.9,len(dflist))[::-1]
    falphar = np.linspace(0.3,0.9,len(dflist))
    plt.plot(dfstart['A'],dfstart['B'],'rs',ms=5,label='start')

    cmap = plt.get_cmap('viridis')
    cNorm = colors.Normalize(vmin=0, vmax=len(dflist))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cmap)
    xyb = []
    ax.plot(mu_x,mu_y,'yo',label='peak response (optimal)')
    im = ax.imshow(Zin, origin='lower', cmap='Blues', extent=[minx, maxx, miny, maxy], aspect='auto')  # use the Zin argument rather than the global Z

    for idx,dfin in enumerate(dflist):

        # set rectangle locations based on new design

        width = np.max(dfin["A"])-np.min(dfin["A"])
        height = np.max(dfin["B"])-np.min(dfin["B"])
        corner_y = np.min(dfin["B"])
        corner_x = np.min(dfin["A"])
        colorVal = scalarMap.to_rgba(idx)
        if idx != 0:
            rect = plt.Rectangle((corner_x,corner_y),width,height,linewidth=3,edgecolor=colorVal,facecolor='none')
        else:
            rect = plt.Rectangle((corner_x,corner_y),width,height,ls='--',linewidth=2,edgecolor='r',facecolor='none')
        
        ax.add_patch(rect)

        # obtain the best factors

        xyb.append([best[idx-1]['factor_settings']["A"],best[idx-1]['factor_settings']["B"]])
        
        # label the iteration in the center of each square

        ax.text(corner_x+width/2.,corner_y+height/2.,idx+1,fontsize=15,fontweight='bold',color=colorVal,va='center',ha='center')

    xyb = np.array(xyb)
    best_exp = expi._best_experiment
    plt.plot(best_exp['optimal_x']['A'],best_exp['optimal_x']['B'],'gx',ms=15,label='best')
    ax.set_xlabel("A")
    ax.set_ylabel("B")
    plt.colorbar(im)
    plt.xlim([minx,maxx])
    plt.ylim([miny,maxy])
    plt.subplots_adjust(left=0.1,right=0.99,top=0.9,bottom=0.1)
    plt.savefig("search.png",bbox_inches='tight',dpi=300)
    plt.grid()

# plot each experimental design proposed
plot_search(designs,Z,[mu_x,mu_y],exp)

r_0 = generate_response(df,rv)
fractioni = pd.DataFrame.from_dict({"fraction":r_0})
dfoptimal,model,prediction = exp.get_optimal_settings(fractioni)
plt.legend(loc='best')
plt.savefig("search.png",bbox_inches='tight',dpi=300)
plt.show()

[figure: search.png — successive experimental designs overlaid on the response surface]

The dashed rectangle marks the initial guess, and the labels 1-6 mark each successive design/iteration. The yellow dot is the hard-set optimum to be found.

A few questions:

  1. At each iteration, does doepipeline 'forget' what has been run beforehand and treat each new run separately? I would have thought that all responses from previous runs would be fed through the OLS regression to propose new runs. In that case, should I just concatenate data frames to grow the set of experimental runs fed into exp.get_best_experiment each time (see the sketch after this list)?
  2. It seems that get_best_experiment and get_optimal_settings can only return experimental conditions that have already been tested, and not actually interpolated optimal responses? I would expect that after maybe three or four iterations the system would predict an optimum closer to the true optimum, no? I guess I'm a bit confused by the nomenclature.
  3. Is this response surface appropriate for doepipeline as a test? It seems that despite model.summary() looking quite desirable, it still didn't converge (see below).
  4. Should I be using shrinkage in this way? It seems like the only way to converge on anything, as the other approaches seem to just keep generating new, slightly different experimental parameters for A and B.
  5. Inside designer.py, at _new_optimization_design(self), I can't see how it uses OLS to generate a new set of experimental settings, at least in my intuitive arrangement here:
    fractioni = pd.DataFrame.from_dict({"fraction":r_0})
    bi = exp.get_best_experiment(df,fractioni)
    fi = exp.update_factors_from_optimum(bi)
    opti = exp.get_optimal_settings(fractioni)
    df = exp.new_design()
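
To make question 1 concrete, this is roughly what I mean by concatenating. It is only a sketch of possible usage on my part; all_dfs and all_fracs are names I made up, and I don't know whether get_best_experiment is actually meant to accept the full history like this.

# Sketch of question 1 (a guess at usage, not something I know the API supports):
# accumulate every design/response pair and feed the growing history back in,
# rather than only the latest design.
all_dfs = []    # hypothetical accumulator of designs
all_fracs = []  # hypothetical accumulator of responses

for niters in range(number_of_iterations):
    r_i = generate_response(df, rv)
    all_dfs.append(df)
    all_fracs.append(pd.DataFrame.from_dict({"fraction": r_i}))

    design_history = pd.concat(all_dfs, ignore_index=True)
    response_history = pd.concat(all_fracs, ignore_index=True)

    # Is this the intended usage, or should only the latest pair be passed?
    bi = exp.get_best_experiment(design_history, response_history)
    exp.update_factors_from_optimum(bi)
    df = exp.new_design()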

Lastly, I get the following:

[figure: finalmodel — OLS model summary output]

Given mu_x = 60 and mu_y = 75, I'm just wondering if I've done something wrong, set it up incorrectly, or taken the pipeline into an area it isn't best suited to. Apologies for any silly errors; I'm just trying to understand what's going on, as the provided examples have a certain overhead to get started with. A very simple, purely Pythonic example would be greatly appreciated. Thanks for any help you might provide.

RickardSjogren (Contributor) commented

Hi @bgriffen, thank you for your interest and your well-described questions! I will try to answer them below.

  1. If I remember correctly, the algorithm "forgets" previous runs. Since it uses least-squares modelling, there is a risk that the model would be biased towards the region of data space where there are more observations, i.e. in the direction from which previous iterations were run. The purpose of statistical experimental designs is to eliminate bias and provide good support for the desired type of model.
  2. The algorithm returns the best experiment that has actually been run. The updates between iterations are done based on the "interpolated" predictions. The reason is that the curvature of the response surface probably does not perfectly match the model surface, so we return the run that we have confirmed to be the best.
  3. Your model is a simple linear model that approximates the Gaussian surface very poorly, since the surface is strongly curved. The "fullfactorial2levels" design does not support quadratic models, so my first choice would be to try a response-surface design that allows for quadratic models (e.g. "ccc"). Hopefully that will improve the convergence of the algorithm in this toy example as well.
  4. Hopefully this problem is solved by my suggestion in 3. We have used this algorithm in-house for many different real-world applications over a number of years, and find that it converges well.
  5. To use OLS to model the response and find a predicted optimum, predict_optimum is used. The limits of the design are then updated based on the predicted optimum in update_factors_from_optimum. _new_optimization_design outputs a design based on the current state of the factors (see the sketch below).
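
To make the flow in 3 and 5 a little more concrete, here is a rough sketch of one possible optimization loop with a quadratic-capable design. It reuses the call order from your own snippet and is not tested against the current code, so treat it as illustrative only:

# Rough, untested sketch: same call sequence as in the snippet above,
# but with a response-surface design ('ccc') that supports quadratic models.
exp = ExperimentDesigner(factors,
                         'ccc',
                         responses,
                         model_selection='greedy',
                         skip_screening=True,
                         shrinkage=0.9)

df = exp.new_design()                      # design from the current factor limits

for iteration in range(number_of_iterations):
    measured = generate_response(df, rv)   # run (here: simulate) the experiments
    response_df = pd.DataFrame.from_dict({"fraction": measured})

    best = exp.get_best_experiment(df, response_df)  # best run confirmed so far
    exp.update_factors_from_optimum(best)  # re-centre the limits around the predicted optimum

    df = exp.new_design()                  # new design within the updated limits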

I hope this answers your questions.


bgriffen commented Jul 25, 2019

Hi Rickard -- thank you for the clarification. My understanding improves each day. Just a few additional comments/questions below, if that's OK.

  2. The algorithm returns the best experiment that has actually been run. The updates between iterations are done based on the "interpolated" predictions. The reason is that the curvature of the response surface probably does not perfectly match the model surface, so we return the run that we have confirmed to be the best.

Indeed, I did think I was overstepping the mark with my multivariate normal distribution. With a sufficiently large variance, though, it should be easier to model with a fullfactorial2levels design. Indeed, the predicted surface optimum is 0.01562, which is rather close to the actual optimum of 0.01592. The predicted values of A and B, however, are quite far off: A ~ 71 (actual = 60) and B ~ 90.5 (actual = 75). Then again, a two-level full factorial might not cut it, even for large variances in the distribution (updated below).

[figure: response.png — response surface with updated (larger) variance]

  3. Your model is a simple linear model that approximates the Gaussian surface very poorly, since the surface is strongly curved. The "fullfactorial2levels" design does not support quadratic models, so my first choice would be to try a response-surface design that allows for quadratic models (e.g. "ccc"). Hopefully that will improve the convergence of the algorithm in this toy example as well.

OK, I'll experiment with the other designs. The trouble I face is choosing a design that can be implemented physically versus what the response surface function may be (which is unknown a priori). I need to actually perform these experiments physically, not in silico, so a two-level or three-level full factorial is easier to arrange, but it may not be the best choice given that ccc allows quadratic responses to be well described.

On that note, if I have historic data that doesn't fit the current design, e.g. a sparse sampling, can that data still be used to initialize the search? Or do you have to set up the whole pipeline "clean" and carry out the experimental designs as generated by fullfactorial2levels, attaching their responses once measured? Additionally, once I have performed the fullfactorial2levels runs, can I then change the design mid-way through the optimization to get a better functional fit/prediction of the optimum?

  5. To use OLS to model the response and find a predicted optimum, predict_optimum is used. The limits of the design are then updated based on the predicted optimum in update_factors_from_optimum. _new_optimization_design outputs a design based on the current state of the factors.

I guess what I asked poorly was whether I need to run a predict_optimum step first in order for the OLS model to be used to generate new experiments.

Just to double-check: is my order of doing things correct? I've added comments to the lines in question below.

exp = ExperimentDesigner(factors,
                        'ccc',
                        responses,
                        model_selection='greedy',
                        skip_screening=True,
                        shrinkage=0.9)

df = exp.new_design()

# single iteration e.g.
for niters in range(number_of_iterations):
    r_0 = generate_response(df,rv)
    fractioni = pd.DataFrame.from_dict({"fraction":r_0})
    bi = exp.get_best_experiment(df,fractioni) # get the best experiment of the run
    fi = exp.update_factors_from_optimum(bi) # it is strange that it is named "from_optimum" but the function received the result from exp.get_best_experiment?
    if fi[1]: # if converged, stop generating new experiments
        break
    dfoptimal,model,prediction = exp.get_optimal_settings(fractioni) # slight modification of what is returned originally set in designer.py

    # generate a new design based on responses and best guess optima
    df = exp.new_design()

I'm just struggling to get the order of get_best_experiment, get_optimal_settings, update_factors_from_optimum and new_design right, given the internal variables that are updated. I'm just trying to get the logic of the functions in the correct order, given a response measured in each loop (though synthetically generated here).

Lastly, if a factor's weight is ~0 in the prediction, is there a way to remove it when new_design() is run? Or would rebuilding the designer without that factor, as sketched below, be the intended approach?
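
(Just a guess on my part, with "B" as a stand-in for whichever factor turns out to be unimportant:)

# Guess at a workaround, not a documented feature: drop the near-zero factor
# and rebuild the designer so new_design() only spans the remaining factor(s).
reduced_factors = {name: spec for name, spec in factors.items() if name != "B"}

exp = ExperimentDesigner(reduced_factors,
                         'ccc',
                         responses,
                         model_selection='greedy',
                         skip_screening=True,
                         shrinkage=0.9)
df = exp.new_design()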

Thank you for your help. Much appreciated.
