From 1920984cb85b61073c293b810444687b41987679 Mon Sep 17 00:00:00 2001 From: leschultz Date: Mon, 22 Apr 2024 14:43:08 -0500 Subject: [PATCH] Updated documentation --- docs/multilearn/datasets.html | 448 +++++++++++++++++++ docs/multilearn/index.html | 75 ++++ docs/multilearn/models.html | 272 ++++++++++++ docs/multilearn/plots.html | 617 ++++++++++++++++++++++++++ docs/multilearn/utils.html | 790 ++++++++++++++++++++++++++++++++++ setup.py | 2 +- src/multilearn/datasets.py | 36 ++ src/multilearn/models.py | 14 + src/multilearn/plots.py | 41 +- src/multilearn/utils.py | 60 +++ 10 files changed, 2345 insertions(+), 10 deletions(-) create mode 100644 docs/multilearn/datasets.html create mode 100644 docs/multilearn/index.html create mode 100644 docs/multilearn/models.html create mode 100644 docs/multilearn/plots.html create mode 100644 docs/multilearn/utils.html diff --git a/docs/multilearn/datasets.html b/docs/multilearn/datasets.html new file mode 100644 index 0000000..6137c82 --- /dev/null +++ b/docs/multilearn/datasets.html @@ -0,0 +1,448 @@ + + + + + + +multilearn.datasets API documentation + + + + + + + + + + + +
+
+
+

Module multilearn.datasets

+
+
+
+ +Expand source code + +
from sklearn.model_selection import train_test_split
+
+import pandas as pd
+import numpy as np
+
+import pkg_resources
+import os
+
+data_path = pkg_resources.resource_filename('multilearn', 'data')
+
+
+def splitter(X, y, names=None, train_size=1.0, val_size=0.0, test_size=0.0):
+    '''
+    Split list of data into train, validation, and test splits.
+
+    Args:
+        X (list): A list of features.
+        y (list): A list of target values.
+        names (list): A list of names for each dataset.
+        train_size (float): The fraction of training data.
+        val_size (float): The fraction of validation data.
+        test_size (float): The fraction of test data.
+
+    Returns:
+        dict: A dictionary of data splits.
+    '''
+
+    n = len(X)
+    if names is None:
+        assert n == len(y)
+    else:
+        assert n == len(y) == len(names)
+
+    data = {}
+    for i in range(n):
+        d = split(X[i], y[i], train_size, val_size, test_size)
+
+        if names is None:
+            data[i] = d
+        else:
+            data[names[i]] = d
+
+    return data
+
+
+def split(X, y, train_size=1.0, val_size=0.0, test_size=0.0):
+    '''
+    Split data into train, validation, and test splits.
+
+    Args:
+        X (np.ndarray): A list of features.
+        y (np.ndarray): A list of target values.
+        train_size (float): The fraction of training data.
+        val_size (float): The fraction of validation data.
+        test_size (float): The fraction of test data.
+
+    Returns:
+        dict: A dictionary of data splits.
+    '''
+
+    # Make sure data splits sum to 1
+    assert train_size+val_size+test_size == 1.0, (
+        'Split fractions must sum to 1'
+    )
+
+    if train_size+val_size < 1.0:
+        test_size = 1.0-train_size-val_size
+
+    elif train_size+test_size < 1.0:
+        val_size = 1.0-train_size-test_size
+
+    elif val_size+test_size < 1.0:
+        train_size = 1.0-val_size-test_size
+
+    # Now split data as needed
+    data = {}
+    if train_size == 1.0:
+        data['X_train'] = X
+        data['y_train'] = y
+
+    else:
+
+        splits = train_test_split(X, y, train_size=train_size)
+        X_train, X_test, y_train, y_test = splits
+
+        data['X_train'] = X_train
+        data['y_train'] = y_train
+
+        if train_size+val_size == 1.0:
+            data['X_val'] = X_test
+            data['y_val'] = y_test
+
+        elif train_size+test_size == 1.0:
+            data['X_test'] = X_test
+            data['y_test'] = y_test
+
+        else:
+            splits = train_test_split(
+                                      X_test,
+                                      y_test,
+                                      test_size=test_size/(test_size+val_size),
+                                      )
+            X_val, X_test, y_val, y_test = splits
+            data['X_val'] = X_val
+            data['y_val'] = y_val
+            data['X_test'] = X_test
+            data['y_test'] = y_test
+
+    return data
+
+
+def load(names):
+    '''
+    Load data included with the package.
+
+    Args:
+        names (list): A list of data to load.
+
+    Returns:
+        Tuple[list, list]: A tuple of lists of features and target variables.
+    '''
+
+    Xs = []
+    ys = []
+    for name in names:
+
+        if name == 'toy1':
+
+            X = np.random.uniform(size=(1000, 3))
+            y = 3+X[:, 0]+X[:, 1]**3+np.log(X[:, 2])
+
+        elif name == 'toy2':
+
+            X = np.random.uniform(-100, 50, size=(900, 3))
+            y = 3+X[:, 0]+X[:, 1]**3+X[:, 2]
+
+        elif name == 'friedman1':
+
+            X = np.random.uniform(size=(500, 5))
+            y = (
+                 10*np.sin(np.pi*X[:, 0]*X[:, 1])
+                 + 20*(X[:, 2]-0.5)**2
+                 + 10*X[:, 3]
+                 + 5*X[:, 4]
+                 )
+
+        else:
+            path = os.path.join(data_path, f'{name}.csv')
+            df = pd.read_csv(path)
+
+            y = df['y'].values
+            X = df.drop('y', axis=1).values
+
+        Xs.append(X)
+        ys.append(y)
+
+    return Xs, ys
+
+
+
+
+
+
+
+

Functions

+
+
+def load(names) +
+
+

Load data included with the package.

+

Args

+
+
names : list
+
A list of data to load.
+
+

Returns

+
+
Tuple[list, list]
+
A tuple of lists of features and target variables.
+
+
+ +Expand source code + +
def load(names):
+    '''
+    Load data included with the package.
+
+    Args:
+        names (list): A list of data to load.
+
+    Returns:
+        Tuple[list, list]: A tuple of lists of features and target variables.
+    '''
+
+    Xs = []
+    ys = []
+    for name in names:
+
+        if name == 'toy1':
+
+            X = np.random.uniform(size=(1000, 3))
+            y = 3+X[:, 0]+X[:, 1]**3+np.log(X[:, 2])
+
+        elif name == 'toy2':
+
+            X = np.random.uniform(-100, 50, size=(900, 3))
+            y = 3+X[:, 0]+X[:, 1]**3+X[:, 2]
+
+        elif name == 'friedman1':
+
+            X = np.random.uniform(size=(500, 5))
+            y = (
+                 10*np.sin(np.pi*X[:, 0]*X[:, 1])
+                 + 20*(X[:, 2]-0.5)**2
+                 + 10*X[:, 3]
+                 + 5*X[:, 4]
+                 )
+
+        else:
+            path = os.path.join(data_path, f'{name}.csv')
+            df = pd.read_csv(path)
+
+            y = df['y'].values
+            X = df.drop('y', axis=1).values
+
+        Xs.append(X)
+        ys.append(y)
+
+    return Xs, ys
+
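A minimal usage sketch for load (not part of the generated docs; 'toy1' and 'friedman1' are the built-in synthetic sets defined above):

    from multilearn import datasets

    # Each requested name yields one feature matrix and one target vector
    Xs, ys = datasets.load(['toy1', 'friedman1'])
    print(Xs[0].shape, ys[0].shape)  # (1000, 3) (1000,)
    print(Xs[1].shape, ys[1].shape)  # (500, 5) (500,)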
+
+
+def split(X, y, train_size=1.0, val_size=0.0, test_size=0.0) +
+
+

Split data into train, validation, and test splits.

+

Args

+
+
X : np.ndarray
+
A list of features.
+
y : np.ndarray
+
A list of target values.
+
train_size : float
+
The fraction of training data.
+
val_size : float
+
The fraction of validation data.
+
test_size : float
+
The fraction of test data.
+
+

Returns

+
+
dict
+
A dictionary of data splits.
+
+
+ +Expand source code + +
def split(X, y, train_size=1.0, val_size=0.0, test_size=0.0):
+    '''
+    Split data into train, validation, and test splits.
+
+    Args:
+        X (np.ndarray): A list of features.
+        y (np.ndarray): A list of target values.
+        train_size (float): The fraction of training data.
+        val_size (float): The fraction of validation data.
+        test_size (float): The fraction of test data.
+
+    Returns:
+        dict: A dictionary of data splits.
+    '''
+
+    # Make sure data splits sum to 1
+    assert train_size+val_size+test_size == 1.0, (
+        'Split fractions must sum to 1'
+    )
+
+    if train_size+val_size < 1.0:
+        test_size = 1.0-train_size-val_size
+
+    elif train_size+test_size < 1.0:
+        val_size = 1.0-train_size-test_size
+
+    elif val_size+test_size < 1.0:
+        train_size = 1.0-val_size-test_size
+
+    # Now split data as needed
+    data = {}
+    if train_size == 1.0:
+        data['X_train'] = X
+        data['y_train'] = y
+
+    else:
+
+        splits = train_test_split(X, y, train_size=train_size)
+        X_train, X_test, y_train, y_test = splits
+
+        data['X_train'] = X_train
+        data['y_train'] = y_train
+
+        if train_size+val_size == 1.0:
+            data['X_val'] = X_test
+            data['y_val'] = y_test
+
+        elif train_size+test_size == 1.0:
+            data['X_test'] = X_test
+            data['y_test'] = y_test
+
+        else:
+            splits = train_test_split(
+                                      X_test,
+                                      y_test,
+                                      test_size=test_size/(test_size+val_size),
+                                      )
+            X_val, X_test, y_val, y_test = splits
+            data['X_val'] = X_val
+            data['y_val'] = y_val
+            data['X_test'] = X_test
+            data['y_test'] = y_test
+
+    return data
+
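A minimal sketch for split; the fractions must sum to 1, as the assertion above requires:

    import numpy as np
    from multilearn.datasets import split

    X = np.random.uniform(size=(100, 3))
    y = X.sum(axis=1)

    d = split(X, y, train_size=0.8, val_size=0.1, test_size=0.1)
    print(sorted(d))  # ['X_test', 'X_train', 'X_val', 'y_test', 'y_train', 'y_val']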
+
+
+def splitter(X, y, names=None, train_size=1.0, val_size=0.0, test_size=0.0) +
+
+

Split list of data into train, validation, and test splits.

+

Args

+
+
X : list
+
A list of features.
+
y : list
+
A list of target values.
+
names : list
+
A list of names for each dataset.
+
train_size : float
+
The fraction of training data.
+
val_size : float
+
The fraction of validation data.
+
test_size : float
+
The fraction of test data.
+
+

Returns

+
+
dict
+
A dictionary of data splits.
+
+
+ +Expand source code + +
def splitter(X, y, names=None, train_size=1.0, val_size=0.0, test_size=0.0):
+    '''
+    Split list of data into train, validation, and test splits.
+
+    Args:
+        X (list): A list of features.
+        y (list): A list of target values.
+        names (list): A list of names for each dataset.
+        train_size (float): The fraction of training data.
+        val_size (float): The fraction of validation data.
+        test_size (float): The fraction of test data.
+
+    Returns:
+        dict: A dictionary of data splits.
+    '''
+
+    n = len(X)
+    if names is None:
+        assert n == len(y)
+    else:
+        assert n == len(y) == len(names)
+
+    data = {}
+    for i in range(n):
+        d = split(X[i], y[i], train_size, val_size, test_size)
+
+        if names is None:
+            data[i] = d
+        else:
+            data[names[i]] = d
+
+    return data
+
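A sketch combining load and splitter; the dataset names are illustrative and become the keys of the returned dictionary:

    from multilearn import datasets

    names = ['toy1', 'toy2']
    Xs, ys = datasets.load(names)
    data = datasets.splitter(
        Xs,
        ys,
        names=names,
        train_size=0.8,
        val_size=0.1,
        test_size=0.1,
        )

    print(data['toy1']['X_train'].shape)  # (800, 3)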
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/multilearn/index.html b/docs/multilearn/index.html new file mode 100644 index 0000000..299796e --- /dev/null +++ b/docs/multilearn/index.html @@ -0,0 +1,75 @@ + + + + + + +multilearn API documentation + + + + + + + + + + + +
+ + +
+ + + \ No newline at end of file diff --git a/docs/multilearn/models.html b/docs/multilearn/models.html new file mode 100644 index 0000000..7d73f43 --- /dev/null +++ b/docs/multilearn/models.html @@ -0,0 +1,272 @@ + + + + + + +multilearn.models API documentation + + + + + + + + + + + +
+
+
+

Module multilearn.models

+
+
+
+ +Expand source code + +
from torch import nn
+
+
+class MultiNet(nn.Module):
+    '''
+    A general model for building multi-target learning NNs.
+    Each separation of layers is symmetric across input datasets.
+    '''
+
+    def __init__(
+                 self,
+                 input_arch={},
+                 mid_arch={64: 1, 32: 1},
+                 out_arch={},
+                 tasks=[0],
+                 ):
+
+        super(MultiNet, self).__init__()
+
+        def make_layers(arch, is_out=False):
+
+            hidden = nn.ModuleList()
+            for neurons, layers in arch.items():
+                for i in range(layers):
+                    hidden.append(nn.LazyLinear(neurons))
+                    hidden.append(nn.LeakyReLU())
+
+            if is_out:
+                hidden.append(nn.LazyLinear(1))
+
+            hidden = nn.Sequential(*hidden)
+
+            return hidden
+
+        def separate(arch, tasks, is_out=False):
+
+            separate = nn.ModuleDict()
+            for t in tasks:
+                i = make_layers(arch, is_out)
+                separate[t] = i
+
+            return separate
+
+        self.input = separate(input_arch, tasks)
+        self.mid = make_layers(mid_arch)
+        self.out = separate(out_arch, tasks, True)
+
+    def forward(self, x, prop):
+        '''
+        Use a model to predict.
+
+        Args:
+            x (nn.tensor): The features.
+            prop: The property to predict.
+
+        Returns:
+            torch.FloatTensor: The predicted target value.
+        '''
+
+        for i in self.input[prop]:
+            x = i(x)
+
+        for i in self.mid:
+            x = i(x)
+
+        for i in self.out[prop]:
+            x = i(x)
+
+        return x
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class MultiNet +(input_arch={}, mid_arch={64: 1, 32: 1}, out_arch={}, tasks=[0]) +
+
+

A general model for building multi-target learning NNs. +Each separation of layers is symmetric across input datasets.

+

Initialize internal Module state, shared by both nn.Module and ScriptModule.

+
+ +Expand source code + +
class MultiNet(nn.Module):
+    '''
+    A general model for building multi-target learning NNs.
+    Each separation of layers is symmetric across input datasets.
+    '''
+
+    def __init__(
+                 self,
+                 input_arch={},
+                 mid_arch={64: 1, 32: 1},
+                 out_arch={},
+                 tasks=[0],
+                 ):
+
+        super(MultiNet, self).__init__()
+
+        def make_layers(arch, is_out=False):
+
+            hidden = nn.ModuleList()
+            for neurons, layers in arch.items():
+                for i in range(layers):
+                    hidden.append(nn.LazyLinear(neurons))
+                    hidden.append(nn.LeakyReLU())
+
+            if is_out:
+                hidden.append(nn.LazyLinear(1))
+
+            hidden = nn.Sequential(*hidden)
+
+            return hidden
+
+        def separate(arch, tasks, is_out=False):
+
+            separate = nn.ModuleDict()
+            for t in tasks:
+                i = make_layers(arch, is_out)
+                separate[t] = i
+
+            return separate
+
+        self.input = separate(input_arch, tasks)
+        self.mid = make_layers(mid_arch)
+        self.out = separate(out_arch, tasks, True)
+
+    def forward(self, x, prop):
+        '''
+        Use a model to predict.
+
+        Args:
+            x (nn.tensor): The features.
+            prop: The property to predict.
+
+        Returns:
+            torch.FloatTensor: The predicted target value.
+        '''
+
+        for i in self.input[prop]:
+            x = i(x)
+
+        for i in self.mid:
+            x = i(x)
+
+        for i in self.out[prop]:
+            x = i(x)
+
+        return x
+
+

Ancestors

+
    +
  • torch.nn.modules.module.Module
  • +
+

Methods

+
+
+def forward(self, x, prop) ‑> Callable[..., Any] +
+
+

Use a model to predict.

+

Args

+
+
x : nn.tensor
+
The features.
+
prop
+
The property to predict.
+
+

Returns

+
+
torch.FloatTensor
+
The predicted target value.
+
+
+ +Expand source code + +
def forward(self, x, prop):
+    '''
+    Use a model to predict.
+
+    Args:
+        x (nn.tensor): The features.
+        prop: The property to predict.
+
+    Returns:
+        torch.FloatTensor: The predicted target value.
+    '''
+
+    for i in self.input[prop]:
+        x = i(x)
+
+    for i in self.mid:
+        x = i(x)
+
+    for i in self.out[prop]:
+        x = i(x)
+
+    return x
+
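A construction sketch (assumed usage, not taken from the repository examples). nn.ModuleDict requires string keys, so the task names are given as strings here; each architecture dictionary maps layer width to layer count:

    import torch
    from multilearn.models import MultiNet

    model = MultiNet(
        input_arch={64: 1},          # per-task input layers
        mid_arch={64: 1, 32: 1},     # shared middle layers
        out_arch={16: 1},            # per-task output layers (plus a final LazyLinear(1))
        tasks=['toy1', 'toy2'],
        )

    x = torch.rand(8, 3)             # batch of 8 samples with 3 features
    p = model(x, 'toy1')             # predict the 'toy1' property
    print(p.shape)                   # torch.Size([8, 1])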
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/multilearn/plots.html b/docs/multilearn/plots.html new file mode 100644 index 0000000..cd85ee7 --- /dev/null +++ b/docs/multilearn/plots.html @@ -0,0 +1,617 @@ + + + + + + +multilearn.plots API documentation + + + + + + + + + + + +
+
+
+

Module multilearn.plots

+
+
+
+ +Expand source code + +
from matplotlib import pyplot as pl
+from sklearn import metrics
+
+import numpy as np
+
+import matplotlib
+import json
+import os
+
+# Font styles
+font = {'font.size': 16, 'lines.markersize': 10}
+matplotlib.rcParams.update(font)
+
+
+def plot_dump(data, fig, ax, save, legend=True):
+    '''
+    Function to dump figures.
+
+    Args:
+        data (dict): Data to dump in json file.
+        fig (object): Figure object.
+        ax (object): Axes object.
+        save (str): The location to save plot.
+    '''
+
+    fig.tight_layout()
+
+    if legend:
+
+        fig_legend, ax_legend = pl.subplots()
+        ax_legend.axis(False)
+
+        legend = ax_legend.legend(
+                                  *ax.get_legend_handles_labels(),
+                                  frameon=False,
+                                  loc='center',
+                                  bbox_to_anchor=(0.5, 0.5)
+                                  )
+
+        ax_legend.spines['top'].set_visible(False)
+        ax_legend.spines['bottom'].set_visible(False)
+        ax_legend.spines['left'].set_visible(False)
+        ax_legend.spines['right'].set_visible(False)
+
+        fig_legend.savefig(save+'_legend.png', bbox_inches='tight', dpi=400)
+
+        ax.legend([]).set_visible(False)
+
+        pl.close(fig_legend)
+
+    fig.savefig(save+'.png', bbox_inches='tight', dpi=400)
+
+    pl.close(fig)
+
+    with open(save+'.json', 'w') as handle:
+        json.dump(data, handle)
+
+
+def parity(y, y_pred, sigma_y, save, color):
+
+    '''
+    Make a parity plot.
+
+    Args:
+        y (np.ndarray): The true target variable.
+        y_pred (np.ndarray): The predicted target variable.
+        sigma_y (float): The standard deviation of y.
+        save (str): The directory to save plot.
+        color (str): The color of the plot.
+    '''
+
+    rmse = metrics.mean_squared_error(y, y_pred)**0.5
+
+    if y.shape[0] > 1:
+        rmse_sigma = rmse/sigma_y
+    else:
+        rmse_sigma = np.nan
+
+    mae = metrics.mean_absolute_error(y, y_pred)
+    r2 = metrics.r2_score(y, y_pred)
+
+    label = r'$RMSE/\sigma_{y}=$'
+    label += r'{:.2}'.format(rmse_sigma)
+    label += '\n'
+    label += r'$RMSE=$'
+    label += r'{:.2}'.format(rmse)
+    label += '\n'
+    label += r'$MAE=$'
+    label += r'{:.2}'.format(mae)
+    label += '\n'
+    label += r'$R^{2}=$'
+    label += r'{:.2}'.format(r2)
+
+    fig, ax = pl.subplots()
+
+    ax.scatter(
+               y,
+               y_pred,
+               marker='.',
+               zorder=2,
+               color=color,
+               label=label,
+               )
+
+    limits = []
+    min_range = min(min(y), min(y_pred))
+    max_range = max(max(y), max(y_pred))
+    span = max_range-min_range
+    limits.append(min_range-0.1*span)
+    limits.append(max_range+0.1*span)
+
+    # Identity line (y = x)
+    ax.plot(
+            limits,
+            limits,
+            label=r'$y=\hat{y}$',
+            color='k',
+            linestyle=':',
+            zorder=1
+            )
+
+    ax.set_aspect('equal')
+    ax.set_xlim(limits)
+    ax.set_ylim(limits)
+
+    ax.set_ylabel(r'$\hat{y}$')
+    ax.set_xlabel('y')
+
+    h = 8
+    w = 8
+
+    fig.set_size_inches(h, w, forward=True)
+
+    data = {}
+    data[r'$RMSE$'] = float(rmse)
+    data[r'$RMSE/\sigma_{y}$'] = float(rmse_sigma)
+    data[r'$MAE$'] = float(mae)
+    data[r'$R^{2}$'] = float(r2)
+    data['y'] = y.tolist()
+    data['y_pred'] = y_pred.tolist()
+
+    plot_dump(data, fig, ax, save)
+
+
+def generate(
+             df_parity,
+             df_loss,
+             save='.',
+             ):
+
+    '''
+    Generate both parity and learning curve plots.
+
+    Args:
+        df_parity (pd.DataFrame): Parity plot data.
+        df_loss (pd.DataFrame): Learning curve data.
+        save (str): Location to save all outputs.
+    '''
+
+    for group, values in df_parity.groupby(['data', 'split']):
+
+        y = values['y']
+        sigma_y = y.std()
+        y = y.values
+        y_pred = values['p'].values
+
+        data_indx, data_set = group
+
+        if data_set == 'train':
+            color = 'g'
+        elif data_set == 'val':
+            color = 'b'
+        elif data_set == 'test':
+            color = 'r'
+
+        save_dir = os.path.join(*[save, f'{data_indx}', 'parity'])
+        os.makedirs(save_dir, exist_ok=True)
+        newsave = os.path.join(save_dir, data_set)
+
+        parity(y, y_pred, sigma_y, newsave, color)
+
+    for group, values in df_loss.groupby(['data', 'split']):
+
+        x = values['epoch'].values
+        y = values['loss'].values
+
+        data_indx, data_set = group
+
+        if data_set == 'train':
+            color = 'g'
+        elif data_set == 'val':
+            color = 'b'
+        elif data_set == 'test':
+            color = 'r'
+
+        save_dir = os.path.join(*[save, f'{data_indx}', 'loss_vs_epoch'])
+        os.makedirs(save_dir, exist_ok=True)
+        newsave = os.path.join(save_dir, data_set)
+
+        learning_curve(x, y, newsave, data_set, color)
+
+
+def learning_curve(x, y, save, group, color):
+    '''
+    Plot the loss versus the epoch.
+
+    Args:
+        x (list): The epochs.
+        y (list): The loss.
+        save (str): The save location.
+        group (str): The data set in question.
+        color (str): The plot color.
+    '''
+
+    # Regular plot
+    fig, ax = pl.subplots()
+
+    val = min(y)
+
+    label = '{}: lowest loss value: {:.2f}'.format(group.capitalize(), val)
+    label += '\n'
+    label += '{}: last loss value: {:.2f}'.format(group.capitalize(), y[-1])
+
+    ax.plot(
+            x,
+            y,
+            marker='.',
+            color=color,
+            label=label,
+            )
+
+    ax.set_xlabel('Epoch')
+    ax.set_ylabel('Loss')
+
+    data = {}
+    data['mae'] = y.tolist()
+    data['epoch'] = x.tolist()
+
+    plot_dump(data, fig, ax, save, False)
+
+
+
+
+
+
+
+

Functions

+
+
+def generate(df_parity, df_loss, save='.') +
+
+

Generate both parity and learning curve plots.

+

Args

+
+
df_parity : pd.DataFrame
+
Parity plot data.
+
df_loss : pd.DataFrame
+
Learning curve data.
+
save : str
+
Location to save all outputs.
+
+
+ +Expand source code + +
def generate(
+             df_parity,
+             df_loss,
+             save='.',
+             ):
+
+    '''
+    Generate both parity and learning curve plots.
+
+    Args:
+        df_parity (pd.DataFrame): Parity plot data.
+        df_loss (pd.DataFrame): Learning curve data.
+        save (str): Location to save all outputs.
+    '''
+
+    for group, values in df_parity.groupby(['data', 'split']):
+
+        y = values['y']
+        sigma_y = y.std()
+        y = y.values
+        y_pred = values['p'].values
+
+        data_indx, data_set = group
+
+        if data_set == 'train':
+            color = 'g'
+        elif data_set == 'val':
+            color = 'b'
+        elif data_set == 'test':
+            color = 'r'
+
+        save_dir = os.path.join(*[save, f'{data_indx}', 'parity'])
+        os.makedirs(save_dir, exist_ok=True)
+        newsave = os.path.join(save_dir, data_set)
+
+        parity(y, y_pred, sigma_y, newsave, color)
+
+    for group, values in df_loss.groupby(['data', 'split']):
+
+        x = values['epoch'].values
+        y = values['loss'].values
+
+        data_indx, data_set = group
+
+        if data_set == 'train':
+            color = 'g'
+        elif data_set == 'val':
+            color = 'b'
+        elif data_set == 'test':
+            color = 'r'
+
+        save_dir = os.path.join(*[save, f'{data_indx}', 'loss_vs_epoch'])
+        os.makedirs(save_dir, exist_ok=True)
+        newsave = os.path.join(save_dir, data_set)
+
+        learning_curve(x, y, newsave, data_set, color)
+
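A sketch of the inputs generate expects; the column names come from the groupby calls above, and the values here are synthetic:

    import numpy as np
    import pandas as pd
    from multilearn import plots

    y = np.random.rand(50)
    df_parity = pd.DataFrame({
        'data': 'toy1',
        'split': 'train',
        'y': y,
        'p': y+np.random.normal(0, 0.05, 50),
        })

    df_loss = pd.DataFrame({
        'data': 'toy1',
        'split': 'train',
        'epoch': np.arange(1, 51),
        'loss': np.linspace(1.0, 0.1, 50),
        })

    plots.generate(df_parity, df_loss, save='./example_plots')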
+
+
+def learning_curve(x, y, save, group, color) +
+
+

Plot the loss versus the epoch.

+

Args

+
+
x : list
+
The epochs.
+
y : list
+
The loss.
+
save : str
+
The save location.
+
group : str
+
The data set in question.
+
color : str
+
The plot color.
+
+
+ +Expand source code + +
def learning_curve(x, y, save, group, color):
+    '''
+    Plot the loss versus the epoch.
+
+    Args:
+        x (list): The epochs.
+        y (list): The loss.
+        save (str): The save location.
+        group (str): The data set in question.
+        color (str): The plot color.
+    '''
+
+    # Regular plot
+    fig, ax = pl.subplots()
+
+    val = min(y)
+
+    label = '{}: lowest loss value: {:.2f}'.format(group.capitalize(), val)
+    label += '\n'
+    label += '{}: last loss value: {:.2f}'.format(group.capitalize(), y[-1])
+
+    ax.plot(
+            x,
+            y,
+            marker='.',
+            color=color,
+            label=label,
+            )
+
+    ax.set_xlabel('Epoch')
+    ax.set_ylabel('Loss')
+
+    data = {}
+    data['mae'] = y.tolist()
+    data['epoch'] = x.tolist()
+
+    plot_dump(data, fig, ax, save, False)
+
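A direct-call sketch with synthetic values; save is a path prefix, and plot_dump appends '.png' and '.json':

    import numpy as np
    from multilearn.plots import learning_curve

    epochs = np.arange(1, 101)
    loss = 1.0/epochs

    learning_curve(epochs, loss, './loss_train', 'train', 'g')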
+
+
+def parity(y, y_pred, sigma_y, save, color) +
+
+

Make a parity plot.

+

Args

+
+
y : np.ndarray
+
The true target variable.
+
y_pred : np.ndarray
+
The predicted target variable.
+
sigma_y : float
+
The standard deviation of y.
+
save : str
+
The directory to save plot.
+
color : str
+
The color of the plot.
+
+
+ +Expand source code + +
def parity(y, y_pred, sigma_y, save, color):
+
+    '''
+    Make a parity plot.
+
+    Args:
+        y (np.ndarray): The true target variable.
+        y_pred (np.ndarray): The predicted target variable.
+        sigma_y (float): The standard deviation of y.
+        save (str): The directory to save plot.
+        color (str): The color of the plot.
+    '''
+
+    rmse = metrics.mean_squared_error(y, y_pred)**0.5
+
+    if y.shape[0] > 1:
+        rmse_sigma = rmse/sigma_y
+    else:
+        rmse_sigma = np.nan
+
+    mae = metrics.mean_absolute_error(y, y_pred)
+    r2 = metrics.r2_score(y, y_pred)
+
+    label = r'$RMSE/\sigma_{y}=$'
+    label += r'{:.2}'.format(rmse_sigma)
+    label += '\n'
+    label += r'$RMSE=$'
+    label += r'{:.2}'.format(rmse)
+    label += '\n'
+    label += r'$MAE=$'
+    label += r'{:.2}'.format(mae)
+    label += '\n'
+    label += r'$R^{2}=$'
+    label += r'{:.2}'.format(r2)
+
+    fig, ax = pl.subplots()
+
+    ax.scatter(
+               y,
+               y_pred,
+               marker='.',
+               zorder=2,
+               color=color,
+               label=label,
+               )
+
+    limits = []
+    min_range = min(min(y), min(y_pred))
+    max_range = max(max(y), max(y_pred))
+    span = max_range-min_range
+    limits.append(min_range-0.1*span)
+    limits.append(max_range+0.1*span)
+
+    # Identity line (y = x)
+    ax.plot(
+            limits,
+            limits,
+            label=r'$y=\hat{y}$',
+            color='k',
+            linestyle=':',
+            zorder=1
+            )
+
+    ax.set_aspect('equal')
+    ax.set_xlim(limits)
+    ax.set_ylim(limits)
+
+    ax.set_ylabel(r'$\hat{y}$')
+    ax.set_xlabel('y')
+
+    h = 8
+    w = 8
+
+    fig.set_size_inches(h, w, forward=True)
+
+    data = {}
+    data[r'$RMSE$'] = float(rmse)
+    data[r'$RMSE/\sigma_{y}$'] = float(rmse_sigma)
+    data[r'$MAE$'] = float(mae)
+    data[r'$R^{2}$'] = float(r2)
+    data['y'] = y.tolist()
+    data['y_pred'] = y_pred.tolist()
+
+    plot_dump(data, fig, ax, save)
+
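A direct-call sketch with synthetic predictions; save is again a path prefix rather than a full file name:

    import numpy as np
    from multilearn.plots import parity

    y = np.random.rand(100)
    y_pred = y+np.random.normal(0, 0.05, 100)

    parity(y, y_pred, sigma_y=y.std(), save='./parity_train', color='g')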
+
+
+def plot_dump(data, fig, ax, save, legend=True) +
+
+

Function to dump figures.

+

Args

+
+
data : dict
+
Data to dump in json file.
+
fig : object
+
Figure object.
+
ax : object
+
Axes object.
+
save : str
+
The location to save plot.
+
+
+ +Expand source code + +
def plot_dump(data, fig, ax, save, legend=True):
+    '''
+    Function to dump figures.
+
+    Args:
+        data (dict): Data to dump in json file.
+        fig (object): Figure object.
+        ax (object): Axes object.
+        save (str): The location to save plot.
+    '''
+
+    fig.tight_layout()
+
+    if legend:
+
+        fig_legend, ax_legend = pl.subplots()
+        ax_legend.axis(False)
+
+        legend = ax_legend.legend(
+                                  *ax.get_legend_handles_labels(),
+                                  frameon=False,
+                                  loc='center',
+                                  bbox_to_anchor=(0.5, 0.5)
+                                  )
+
+        ax_legend.spines['top'].set_visible(False)
+        ax_legend.spines['bottom'].set_visible(False)
+        ax_legend.spines['left'].set_visible(False)
+        ax_legend.spines['right'].set_visible(False)
+
+        fig_legend.savefig(save+'_legend.png', bbox_inches='tight', dpi=400)
+
+        ax.legend([]).set_visible(False)
+
+        pl.close(fig_legend)
+
+    fig.savefig(save+'.png', bbox_inches='tight', dpi=400)
+
+    pl.close(fig)
+
+    with open(save+'.json', 'w') as handle:
+        json.dump(data, handle)
+
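A minimal sketch of how plot_dump is used by the other helpers; the labeled artist is needed for the separate legend figure:

    from matplotlib import pyplot as pl
    from multilearn.plots import plot_dump

    fig, ax = pl.subplots()
    ax.plot([1, 2, 3], [3, 2, 1], label='example')

    # Writes example.png, example_legend.png, and example.json
    plot_dump({'y': [3, 2, 1]}, fig, ax, save='./example')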
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/multilearn/utils.html b/docs/multilearn/utils.html new file mode 100644 index 0000000..ea10f45 --- /dev/null +++ b/docs/multilearn/utils.html @@ -0,0 +1,790 @@ + + + + + + +multilearn.utils API documentation + + + + + + + + + + + +
+
+
+

Module multilearn.utils

+
+
+
+ +Expand source code + +
from lightning.pytorch.utilities.combined_loader import CombinedLoader
+from torch.utils.data import DataLoader, TensorDataset
+from multilearn import plots
+from joblib import dump
+
+import pandas as pd
+import numpy as np
+
+import torch
+import copy
+import dill
+import os
+
+# Choose default device
+if torch.cuda.is_available():
+    device = torch.device('cuda')
+else:
+    device = torch.device('cpu')
+
+
+def save(
+         model,
+         df_parity,
+         df_loss,
+         data,
+         save_dir='./outputs',
+         ):
+
+    '''
+    Save results of run.
+
+    Args:
+        model (object): The trained PyTorch model.
+        df_parity (pd.DataFrame): The parity plot data.
+        df_loss (pd.DataFrame): The learning curve data.
+        data (dict): The data splits.
+        save_dir (str): The location to save outputs.
+    '''
+
+    os.makedirs(save_dir, exist_ok=True)
+
+    plots.generate(df_parity, df_loss, save_dir)
+
+    torch.save(
+               model,
+               os.path.join(save_dir, 'model.pth')
+               )
+
+    df_parity.to_csv(os.path.join(save_dir, 'predictions.csv'), index=False)
+    df_loss.to_csv(os.path.join(save_dir, 'loss_vs_epochs.csv'), index=False)
+
+    for key, value in data.items():
+
+        new_dir = os.path.join(save_dir, key)
+        for k, v in value.items():
+            if k == 'scaler':
+                dump(v, os.path.join(new_dir, 'scaler.joblib'))
+
+            elif k == 'loss':
+                dill.dump(
+                          v,
+                          open(os.path.join(new_dir, 'loss.pkl'), 'wb'),
+                          )
+
+            elif ('X_' in k) or ('y_' in k):
+
+                # Tensors must be moved to the CPU before writing to CSV
+                v = v.cpu().detach()
+
+                np.savetxt(os.path.join(
+                                        new_dir,
+                                        f'{k}.csv',
+                                        ), v, delimiter=',')
+
+
+def to_tensor(x):
+    '''
+    Convert variable to tensor.
+
+    Args:
+        x (np.ndarray): The variable to convert.
+
+    Returns:
+        torch.FloatTensor: The converted variable.
+    '''
+
+    y = torch.FloatTensor(x).to(device)
+
+    if len(y.shape) < 2:
+        y = y.reshape(-1, 1)
+
+    return y
+
+
+def loader(X, y, batch_size=32, shuffle=True):
+    '''
+    A wrapper to load data for pytorch.
+
+    Args:
+        X (torch.FloatTensor): The features.
+        y (torch.FloatTensor): The target values.
+        batch_size (int): The size of the batch for gradient descent.
+        shuffle (bool): Whether to shuffle data.
+
+    Returns:
+        torch.utils.data.DataLoader: The data loader.
+    '''
+
+    data = TensorDataset(X, y)
+    data = DataLoader(
+                      data,
+                      batch_size=batch_size,
+                      shuffle=shuffle,
+                      )
+
+    return data
+
+
+def pred(model, data):
+    '''
+    Function to generate parity plot data predictions.
+
+    Args:
+        model (object): The trained model.
+        data (dict): The data splits.
+
+    Returns:
+        pd.DataFrame: Parity plot data.
+    '''
+
+    df = []
+    with torch.no_grad():
+        for key, value in data.items():
+
+            for k, v in value.items():
+
+                if 'X_' in k:
+                    split = k.split('_')[1]
+                    X = value[k]
+                    y = value['y_'+split]
+                    d = pd.DataFrame()
+                    d['y'] = y.cpu().detach().view(-1)
+                    d['p'] = model(X, key).cpu().detach().view(-1)
+                    d['data'] = key
+                    d['split'] = split
+                    df.append(d)
+
+    df = pd.concat(df)
+
+    return df
+
+
+def train(
+          model,
+          optimizer,
+          data,
+          n_epochs=1000,
+          batch_size=32,
+          lr=1e-4,
+          save_dir='outputs',
+          patience=np.inf,
+          print_n=100,
+          ):
+    '''
+    The training workflow for models.
+
+    Args:
+        model (object): The model to train/assess.
+        optimizer (object): The torch optimizer
+        data (dict): The data with splits.
+        n_epochs (int): The number of epochs to train.
+        batch_size (int): The size of the batch for gradient descent.
+        lr (float): The learning rate.
+        save_dir (str): The location to save outputs.
+        patience (int): Stop training if no improvement after n epochs.
+        print_n (int): The interval to print loss.
+
+    Returns:
+        dict: The trained model and plot data.
+    '''
+
+    # Copy objects
+    model = copy.deepcopy(model).to(device)
+    data = copy.deepcopy(data)
+
+    optimizer = optimizer(model.parameters(), lr=lr)
+
+    # Fit scalers
+    for key, value in data.items():
+        for k, v in value.items():
+            if k == 'scaler':
+                value['scaler'].fit(value['X_train'])
+                break
+
+    # Apply transforms when needed
+    data_train = {}
+    for key, value in data.items():
+        for k, v in value.items():
+            if ('X_' in k) and ('scaler' in value.keys()):
+                value[k] = value['scaler'].transform(value[k])
+
+            if all([k != 'scaler', k != 'loss', k != 'weight']):
+                value[k] = to_tensor(value[k])
+
+        data_train[key] = loader(
+                                 value['X_train'],
+                                 value['y_train'],
+                                 batch_size,
+                                 )
+
+    data_train = CombinedLoader(data_train, 'max_size')
+
+    df_loss = []
+    no_improv = 0
+    best_loss = float('inf')
+    for epoch in range(1, n_epochs+1):
+
+        model.train()
+
+        for batch, _, _ in data_train:
+
+            loss = 0.0
+            for indx in data.keys():
+
+                if batch[indx] is None:
+                    continue
+
+                X = batch[indx][0]
+                y = batch[indx][1]
+
+                p = model(X, indx)
+                i = data[indx]['loss'](p, y)
+
+                if 'weight' in data[indx].keys():
+                    i *= data[indx]['weight']
+
+                loss += i
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        with torch.no_grad():
+            model.eval()
+
+            all_loss = 0.0
+            for indx in data.keys():
+                y = data[indx]['y_train']
+                p = model(data[indx]['X_train'], indx)
+                loss = data[indx]['loss'](p, y).item()
+
+                split = 'train'
+                d = (epoch, loss, indx, split)
+                df_loss.append(d)
+
+                if 'y_val' in data[indx].keys():
+
+                    y = data[indx]['y_val']
+                    p = model(data[indx]['X_val'], indx)
+                    loss = data[indx]['loss'](p, y).item()
+
+                    split = 'val'
+                    d = (epoch, loss, indx, split)
+                    df_loss.append(d)
+
+                    all_loss += loss
+
+                else:
+                    all_loss += loss
+
+        # Early stopping
+        if all_loss < best_loss:
+            best_model = copy.deepcopy(model)
+            best_loss = all_loss
+            no_improv = 0
+
+        else:
+            no_improv += 1
+
+        if no_improv >= patience:
+            break
+
+        if epoch % print_n == 0:
+            print(f'Epoch {epoch}/{n_epochs}: {split} loss {loss:.2f}')
+
+    # Loss curve
+    columns = ['epoch', 'loss', 'data', 'split']
+    df_loss = pd.DataFrame(df_loss, columns=columns)
+
+    # Train parity
+    df_parity = pred(model, data)
+
+    save(
+         model,
+         df_parity,
+         df_loss,
+         data,
+         save_dir,
+         )
+
+    out = {
+           'model': best_model,
+           'df_parity': df_parity,
+           'df_loss': df_loss,
+           'data': data,
+           }
+
+    return out
+
+
+
+
+
+
+
+

Functions

+
+
+def loader(X, y, batch_size=32, shuffle=True) +
+
+

A wrapper to load data for pytorch.

+

Args

+
+
X : torch.FloatTensor
+
The features.
+
y : torch.FloatTensor
+
The target values.
+
batch_size : int
+
The size of the batch for gradient descent.
+
shuffle : bool
+
Whether to shuffle data.
+
+

Returns

+
+
torch.utils.data.DataLoader
+
The data loader.
+
+
+ +Expand source code + +
def loader(X, y, batch_size=32, shuffle=True):
+    '''
+    A wrapper to load data for pytorch.
+
+    Args:
+        X (torch.FloatTensor): The features.
+        y (torch.FloatTensor): The target values.
+        batch_size (int): The size of the batch for gradient descent.
+        shuffle (bool): Whether to shuffle data.
+
+    Returns:
+        torch.utils.data.DataLoader: The data loader.
+    '''
+
+    data = TensorDataset(X, y)
+    data = DataLoader(
+                      data,
+                      batch_size=batch_size,
+                      shuffle=shuffle,
+                      )
+
+    return data
+
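A sketch pairing to_tensor and loader; the batch size is arbitrary:

    import numpy as np
    from multilearn.utils import loader, to_tensor

    X = to_tensor(np.random.rand(100, 3))
    y = to_tensor(np.random.rand(100))

    batches = loader(X, y, batch_size=32, shuffle=True)
    Xb, yb = next(iter(batches))
    print(Xb.shape, yb.shape)  # torch.Size([32, 3]) torch.Size([32, 1])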
+
+
+def pred(model, data) +
+
+

Function to generate parity plot data predictions.

+

Args

+
+
model : object
+
The trained model.
+
data : dict
+
The data splits.
+
+

Returns

+
+
pd.DataFrame
+
Parity plot data.
+
+
+ +Expand source code + +
def pred(model, data):
+    '''
+    Function to generate parity plot data predictions.
+
+    Args:
+        model (object): The trained model.
+        data (dict): The data splits.
+
+    Returns:
+        pd.DataFrame: Parity plot data.
+    '''
+
+    df = []
+    with torch.no_grad():
+        for key, value in data.items():
+
+            for k, v in value.items():
+
+                if 'X_' in k:
+                    split = k.split('_')[1]
+                    X = value[k]
+                    y = value['y_'+split]
+                    d = pd.DataFrame()
+                    d['y'] = y.cpu().detach().view(-1)
+                    d['p'] = model(X, key).cpu().detach().view(-1)
+                    d['data'] = key
+                    d['split'] = split
+                    df.append(d)
+
+    df = pd.concat(df)
+
+    return df
+
+
+
+def save(model, df_parity, df_loss, data, save_dir='./outputs') +
+
+

Save results of run.

+

Args

+
+
model : object
+
The trained PyTorch model.
+
df_parity : pd.DataFrame
+
The parity plot data.
+
df_loss : pd.DataFrame
+
The learning curve data.
+
data : dict
+
The data splits.
+
save_dir : str
+
The location to save outputs.
+
+
+ +Expand source code + +
def save(
+         model,
+         df_parity,
+         df_loss,
+         data,
+         save_dir='./outputs',
+         ):
+
+    '''
+    Save results of run.
+
+    Args:
+        model (object): The trained PyTorch model.
+        df_parity (pd.DataFrame): The parity plot data.
+        df_loss (pd.DataFrame): The learning curve data.
+        data (dict): The data splits.
+        save_dir (str): The location to save outputs.
+    '''
+
+    os.makedirs(save_dir, exist_ok=True)
+
+    plots.generate(df_parity, df_loss, save_dir)
+
+    torch.save(
+               model,
+               os.path.join(save_dir, 'model.pth')
+               )
+
+    df_parity.to_csv(os.path.join(save_dir, 'predictions.csv'), index=False)
+    df_loss.to_csv(os.path.join(save_dir, 'loss_vs_epochs.csv'), index=False)
+
+    for key, value in data.items():
+
+        new_dir = os.path.join(save_dir, key)
+        for k, v in value.items():
+            if k == 'scaler':
+                dump(v, os.path.join(new_dir, 'scaler.joblib'))
+
+            elif k == 'loss':
+                dill.dump(
+                          v,
+                          open(os.path.join(new_dir, 'loss.pkl'), 'wb'),
+                          )
+
+            elif ('X_' in k) or ('y_' in k):
+
+                # Tensors must be moved to the CPU before writing to CSV
+                v = v.cpu().detach()
+
+                np.savetxt(os.path.join(
+                                        new_dir,
+                                        f'{k}.csv',
+                                        ), v, delimiter=',')
+
+
+
+def to_tensor(x) +
+
+

Convert variable to tensor.

+

Args

+
+
x : np.ndarray
+
The variable to convert.
+
+

Returns

+
+
torch.FloatTensor
+
The converted variable.
+
+
+ +Expand source code + +
def to_tensor(x):
+    '''
+    Convert variable to tensor.
+
+    Args:
+        x (np.ndarray): The variable to convert.
+
+    Returns:
+        torch.FloatTensor: The converted variable.
+    '''
+
+    y = torch.FloatTensor(x).to(device)
+
+    if len(y.shape) < 2:
+        y = y.reshape(-1, 1)
+
+    return y
+
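A small sketch showing the reshape behavior for 1D input (the tensor lands on the default device chosen above):

    import numpy as np
    from multilearn.utils import to_tensor

    y = to_tensor(np.array([1.0, 2.0, 3.0]))
    print(y.shape)  # torch.Size([3, 1])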
+
+
+def train(model, optimizer, data, n_epochs=1000, batch_size=32, lr=0.0001, save_dir='outputs', patience=inf, print_n=100) +
+
+

The training workflow for models.

+

Args

+
+
model : object
+
The model to train/assess.
+
optimizer : object
+
The torch optimizer
+
data : dict
+
The data with splits.
+
n_epochs : int
+
The number of epochs to train.
+
batch_size : int
+
The size of the batch for gradient descent.
+
lr : float
+
The learning rate.
+
save_dir : str
+
The location to save outputs.
+
patience : int
+
Stop training if no improvement after n epochs.
+
print_n : int
+
The interval to print loss.
+
+

Returns

+
+
dict
+
The trained model and plot data.
+
+
+ +Expand source code + +
def train(
+          model,
+          optimizer,
+          data,
+          n_epochs=1000,
+          batch_size=32,
+          lr=1e-4,
+          save_dir='outputs',
+          patience=np.inf,
+          print_n=100,
+          ):
+    '''
+    The training workflow for models.
+
+    Args:
+        model (object): The model to train/assess.
+        optimizer (object): The torch optimizer
+        data (dict): The data with splits.
+        n_epochs (int): The number of epochs to train.
+        batch_size (int): The size of the batch for gradient descent.
+        lr (float): The learning rate.
+        save_dir (str): The location to save outputs.
+        patience (int): Stop training if no improvement after n epochs.
+        print_n (int): The interval to print loss.
+
+    Returns:
+        dict: The trained model and plot data.
+    '''
+
+    # Copy objects
+    model = copy.deepcopy(model).to(device)
+    data = copy.deepcopy(data)
+
+    optimizer = optimizer(model.parameters(), lr=lr)
+
+    # Fit scalers
+    for key, value in data.items():
+        for k, v in value.items():
+            if k == 'scaler':
+                value['scaler'].fit(value['X_train'])
+                break
+
+    # Apply transforms when needed
+    data_train = {}
+    for key, value in data.items():
+        for k, v in value.items():
+            if ('X_' in k) and ('scaler' in value.keys()):
+                value[k] = value['scaler'].transform(value[k])
+
+            if all([k != 'scaler', k != 'loss', k != 'weight']):
+                value[k] = to_tensor(value[k])
+
+        data_train[key] = loader(
+                                 value['X_train'],
+                                 value['y_train'],
+                                 batch_size,
+                                 )
+
+    data_train = CombinedLoader(data_train, 'max_size')
+
+    df_loss = []
+    no_improv = 0
+    best_loss = float('inf')
+    for epoch in range(1, n_epochs+1):
+
+        model.train()
+
+        for batch, _, _ in data_train:
+
+            loss = 0.0
+            for indx in data.keys():
+
+                if batch[indx] is None:
+                    continue
+
+                X = batch[indx][0]
+                y = batch[indx][1]
+
+                p = model(X, indx)
+                i = data[indx]['loss'](p, y)
+
+                if 'weight' in data[indx].keys():
+                    i *= data[indx]['weight']
+
+                loss += i
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        with torch.no_grad():
+            model.eval()
+
+            all_loss = 0.0
+            for indx in data.keys():
+                y = data[indx]['y_train']
+                p = model(data[indx]['X_train'], indx)
+                loss = data[indx]['loss'](p, y).item()
+
+                split = 'train'
+                d = (epoch, loss, indx, split)
+                df_loss.append(d)
+
+                if 'y_val' in data[indx].keys():
+
+                    y = data[indx]['y_val']
+                    p = model(data[indx]['X_val'], indx)
+                    loss = data[indx]['loss'](p, y).item()
+
+                    split = 'val'
+                    d = (epoch, loss, indx, split)
+                    df_loss.append(d)
+
+                    all_loss += loss
+
+                else:
+                    all_loss += loss
+
+        # Early stopping
+        if all_loss < best_loss:
+            best_model = copy.deepcopy(model)
+            best_loss = all_loss
+            no_improv = 0
+
+        else:
+            no_improv += 1
+
+        if no_improv >= patience:
+            break
+
+        if epoch % print_n == 0:
+            print(f'Epoch {epoch}/{n_epochs}: {split} loss {loss:.2f}')
+
+    # Loss curve
+    columns = ['epoch', 'loss', 'data', 'split']
+    df_loss = pd.DataFrame(df_loss, columns=columns)
+
+    # Train parity
+    df_parity = pred(model, data)
+
+    save(
+         model,
+         df_parity,
+         df_loss,
+         data,
+         save_dir,
+         )
+
+    out = {
+           'model': best_model,
+           'df_parity': df_parity,
+           'df_loss': df_loss,
+           'data': data,
+           }
+
+    return out
+
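An end-to-end sketch tying datasets, models, and train together. The dataset names, architecture, and hyperparameters are illustrative; each per-dataset dictionary needs at least a 'loss' entry, while 'scaler' and 'weight' are optional:

    from sklearn.preprocessing import StandardScaler
    from torch import nn, optim

    from multilearn import datasets, models, utils

    names = ['toy1', 'toy2']
    Xs, ys = datasets.load(names)
    data = datasets.splitter(Xs, ys, names=names, train_size=0.8, val_size=0.2)

    for name in names:
        data[name]['loss'] = nn.MSELoss()
        data[name]['scaler'] = StandardScaler()

    model = models.MultiNet(
        input_arch={64: 1},
        mid_arch={64: 1, 32: 1},
        out_arch={16: 1},
        tasks=names,
        )

    out = utils.train(
        model,
        optim.Adam,
        data,
        n_epochs=200,
        batch_size=32,
        lr=1e-3,
        save_dir='./outputs',
        print_n=50,
        )

    print(out['df_loss'].tail())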
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/setup.py b/setup.py index a9b0c55..26b6dac 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ # Package information name = 'multilearn' -version = '0.0.1' # Need to increment every time to push to PyPI +version = '0.0.2' # Need to increment every time to push to PyPI description = 'Multi-task learning with Pytorch.' url = 'https://github.com/leschultz/multilearn' author = 'Lane E. Schultz' diff --git a/src/multilearn/datasets.py b/src/multilearn/datasets.py index aa2415e..b753a4e 100644 --- a/src/multilearn/datasets.py +++ b/src/multilearn/datasets.py @@ -10,6 +10,20 @@ def splitter(X, y, names=None, train_size=1.0, val_size=0.0, test_size=0.0): + ''' + Split list of data into train, validation, and test splits. + + Args: + X (list): A list of features. + y (list): A list of target values. + names (list): A list of names for each dataset. + train_size (float): The fraction of training data. + val_size (float): The fraction of validation data. + test_size (float): The fraction of test data. + + Returns: + dict: A dictionary of data splits. + ''' n = len(X) if names is None: @@ -30,6 +44,19 @@ def splitter(X, y, names=None, train_size=1.0, val_size=0.0, test_size=0.0): def split(X, y, train_size=1.0, val_size=0.0, test_size=0.0): + ''' + Split data into train, validation, and test splits. + + Args: + X (np.ndarray): A list of features. + y (np.ndarray): A list of target values. + train_size (float): The fraction of training data. + val_size (float): The fraction of validation data. + test_size (float): The fraction of test data. + + Returns: + dict: A dictionary of data splits. + ''' # Make sure data splits sum to 1 assert train_size+val_size+test_size == 1.0, ( @@ -83,6 +110,15 @@ def split(X, y, train_size=1.0, val_size=0.0, test_size=0.0): def load(names): + ''' + Load data included with the package. + + Args: + names (list): A list of data to load. + + Returns: + Tuple[list, list]: A tuple of lists of features and target variables. + ''' Xs = [] ys = [] diff --git a/src/multilearn/models.py b/src/multilearn/models.py index 42460ef..cb6e0ab 100644 --- a/src/multilearn/models.py +++ b/src/multilearn/models.py @@ -2,6 +2,10 @@ class MultiNet(nn.Module): + ''' + A general model for building multi-target learning NNs. + Each separation of layers is symmetric across input datasets. + ''' def __init__( self, @@ -42,6 +46,16 @@ def separate(arch, tasks, is_out=False): self.out = separate(out_arch, tasks, True) def forward(self, x, prop): + ''' + Use a model to predict. + + Args: + x (nn.tensor): The features. + prop: The property to predict. + + Returns: + torch.FloatTensor: The predicted target value. + ''' for i in self.input[prop]: x = i(x) diff --git a/src/multilearn/plots.py b/src/multilearn/plots.py index 57ead22..829f1d6 100644 --- a/src/multilearn/plots.py +++ b/src/multilearn/plots.py @@ -16,11 +16,11 @@ def plot_dump(data, fig, ax, save, legend=True): ''' Function to dump figures. - inputs: - data = Data to dump in json file. - fig = Figure object. - ax = Axes object. - save = The location to save plot. + Args: + data (dict): Data to dump in json file. + fig (object): Figure object. + ax (object): Axes object. + save (str): The location to save plot. ''' fig.tight_layout() @@ -59,10 +59,14 @@ def plot_dump(data, fig, ax, save, legend=True): def parity(y, y_pred, sigma_y, save, color): ''' - Make a paroody plot. - - inputs: - save = The directory to save plot. + Make a parity plot. + + Args: + y (np.ndarray): The true target variable. 
+ y_pred (np.ndarray): The predicted target variable. + sigma_y (float): The standard deviation of y. + save (str): The directory to save plot. + color (str): The color of the plot. ''' rmse = metrics.mean_squared_error(y, y_pred)**0.5 @@ -144,6 +148,15 @@ def generate( save='.', ): + ''' + Generate both parity and learning curve plots. + + Args: + df_parity (pd.DataFrame): Parity plot data. + df_loss (pd.DataFrame): Learning curve data. + save (str): Location to save all outputs. + ''' + for group, values in df_parity.groupby(['data', 'split']): y = values['y'] @@ -188,6 +201,16 @@ def generate( def learning_curve(x, y, save, group, color): + ''' + Plot the loss versus the epoch. + + Args: + x (list): The epochs. + y (list): The loss. + save (str): The save location. + group (str): The data set in question. + color (str): The plot color. + ''' # Regular plot fig, ax = pl.subplots() diff --git a/src/multilearn/utils.py b/src/multilearn/utils.py index d101d62..d8d0061 100644 --- a/src/multilearn/utils.py +++ b/src/multilearn/utils.py @@ -26,6 +26,17 @@ def save( save_dir='./outputs', ): + ''' + Save results of run. + + Args: + model (object): The trained tensorflow model. + df_parity (pd.DataFrame): The parity plot data. + df_loss (pd.DataFrame): The learning curve data. + data (dict): The data splits. + save_dir (str): The location to save outputs. + ''' + os.makedirs(save_dir, exist_ok=True) plots.generate(df_parity, df_loss, save_dir) @@ -63,6 +74,16 @@ def save( def to_tensor(x): + ''' + Convert variable to tensor. + + Args: + x (np.ndarray): The variable to convert. + + Returns: + torch.FloatTensor: The converted variable. + ''' + y = torch.FloatTensor(x).to(device) if len(y.shape) < 2: @@ -72,6 +93,18 @@ def to_tensor(x): def loader(X, y, batch_size=32, shuffle=True): + ''' + A wrapper to load data for pytorch. + + Args: + X (torch.FloatTensor): The features. + y (torch.FloatTensor): The target values. + batch_size (int): The size of the batch for gradient descent. + shuffle (bool): Whether to shuffle data. + + Returns: + torch.utils.data.DataLoader: The data loader. + ''' data = TensorDataset(X, y) data = DataLoader( @@ -84,6 +117,16 @@ def loader(X, y, batch_size=32, shuffle=True): def pred(model, data): + ''' + Function to generate parity plot data predictions. + + Args: + model (object): The trained model. + data (dict): The data splits. + + Returns: + pd.DataFrame: Parity plot data. + ''' df = [] with torch.no_grad(): @@ -118,6 +161,23 @@ def train( patience=np.inf, print_n=100, ): + ''' + The training workflow for models. + + Args: + model (object): The model to train/assess. + optimizer (object): The torch optimizer + data (dict): The data with splits. + n_epochs (int): The number of epochs to train. + batch_size (int): The size of the batch for gradient descent. + lr (float): The learning rate. + save_dir (str): The location to save outputs. + patience (int): Stop training if no improvement after n epochs. + print_n (int): The interval to print loss. + + Returns: + dict: The trained model and plot data. + ''' # Copy objects model = copy.deepcopy(model).to(device)