evaluator.py
"""
Assumptions:
- Only restricring to columnar dataframes, where input
can be represented as a dataframe
- Input data has a column which has the target label/value
of the example, which will be used to measure the performance of the model
- The platform has a CLI interface which takes a config file which consists
of options likein list of models to be evaluated, input data source,
output data sink, input feature names, output label
- Data fits in memory
"""
import json
from typing import Dict, Tuple

import click
import pandas as pd

from exceptions import ConfigValidationException
from model import TrainedModel


class Evaluator:
def __init__(self, config: Dict) -> None:
self.config = config

    def validate_config(self) -> None:
        """
        Runs common validations on the config file, e.g. that
        - all the required keys are present
        - the config values have the right data type
        """
        keys_format = {
            'input_source': dict,
            'output_sink': dict,
            'input_features': list,
            'output_label': str,
            'models': list,
            'metric': str
        }
        for key, data_type in keys_format.items():
            # Check for presence of required keys
            if key not in self.config:
                raise ConfigValidationException(
                    f"{key} not present in config file")
            # Check that the key has the correct data type
            if not isinstance(self.config[key], data_type):
                raise ConfigValidationException(
                    f"{key} not in correct format, "
                    f"should be of type {data_type}")

    def _load_input_file(self, input_source: Dict) -> pd.DataFrame:
        """
        Loads the input from a file data source and returns it as a
        dataframe
        """
        print(f"Fetching inputs from {input_source['location']}, type=file")
        df = pd.read_csv(input_source['location'])
        return df

    def _load_input_database(self, input_source: Dict) -> pd.DataFrame:
        """
        Loads the input from a database table source and returns it as a
        dataframe. The input source configuration can carry options such
        as database credentials, table name, etc.
        """
        raise NotImplementedError("database input source is not supported yet")
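        # A possible implementation (a sketch only; the "connection_string"
        # and "table" sub-keys are assumptions, not part of the current
        # config contract):
        #
        #   engine = sqlalchemy.create_engine(input_source['connection_string'])
        #   return pd.read_sql_table(input_source['table'], engine)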

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Loads the input features and the target variable as per the
        config options.
        Based on the input source type, calls the relevant helper function.
        """
        input_source = self.config['input_source']
        input_features = self.config['input_features']
        output_label = self.config['output_label']
        input_type = input_source['type']
        if input_type == "file":
            df = self._load_input_file(input_source)
        elif input_type == "database":
            df = self._load_input_database(input_source)
        else:
            # Fail fast instead of leaving df unbound
            raise ConfigValidationException(
                f"unknown input source type {input_type}")
        # Reorder columns based on input_features
        input_data = df[input_features]
        # Get the target variable
        output = df[[output_label]]
        return input_data, output

    def _write_best_model_file(self, output_sink: Dict, best_model: Dict) -> None:  # noqa
        """
        Writes the evaluation result of the best model to a file. (Note:
        only the result is persisted here; the model object itself is not
        dumped to disk.)
        """
        model = best_model["model"]
        result = best_model["result"]
        location = output_sink["location"]
        print(f"Storing results from {model.metadata['name']} into {location}.")  # noqa
        with open(location, 'w') as f:
            f.write(f"{result}%")

    def _write_best_model_database(self, output_sink: Dict, best_model: Dict) -> None:  # noqa
        """
        Writes the best model to a database.
        The output sink configuration can carry options such as database
        credentials, table name, etc.
        """
        raise NotImplementedError("database output sink is not supported yet")
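        # A possible implementation (a sketch only; the sub-keys and the
        # results schema are assumptions):
        #
        #   engine = sqlalchemy.create_engine(output_sink['connection_string'])
        #   row = pd.DataFrame([{
        #       "model": best_model["model"].metadata["name"],
        #       "result": best_model["result"],
        #   }])
        #   row.to_sql(output_sink['table'], engine, if_exists='append')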

    def write_best_model(self, best_model: Dict) -> None:
        """
        Takes the best model found and writes it as per the output
        sink configuration.
        Based on the output sink type, calls the relevant helper function.
        """
        output_sink = self.config["output_sink"]
        output_type = output_sink["type"]
        if output_type == "file":
            self._write_best_model_file(output_sink, best_model)
        elif output_type == "database":
            self._write_best_model_database(output_sink, best_model)
        else:
            raise ConfigValidationException(
                f"unknown output sink type {output_type}")

    def run(self) -> None:
        # Load the data once, then evaluate each model on it and keep
        # track of the best result seen so far
        input_data, actual_output = self.load_data()
        model_paths = self.config["models"]
        metric = self.config["metric"]
        best_model = {
            "model": None,
            "result": -1
        }
        for model_path in model_paths:
            model = TrainedModel.load(model_path)
            name = model.metadata["name"]
            print(f"Running model {name}")
            result = model.evaluate(input_data, actual_output, metric)
            print(f"{result}%")
            # Assumes a higher metric value is better
            if result > best_model["result"]:
                best_model["model"] = model
                best_model["result"] = result
        self.write_best_model(best_model)


@click.command()
@click.option('--config_file', required=True,
              help="Path to a JSON config file (see the example above)")
def run(config_file):
    with open(config_file) as f:
        config = json.load(f)
    evaluator = Evaluator(config)
    evaluator.validate_config()
    evaluator.run()


if __name__ == "__main__":
    run()
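
# Example invocation (the config path is illustrative):
#
#   python evaluator.py --config_file config.json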