Skip to content

Commit

Permalink
Merge pull request #203 from byu-dml/remove_compute_time
Browse files Browse the repository at this point in the history
Remove compute time
  • Loading branch information
bjschoenfeld authored Nov 14, 2019
2 parents 9274653 + 56899bc commit ffe55fa
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 710 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,18 @@ mfs = metafeatures.compute(
seed=0,
n_folds=2,
verbose=True,
timeout=10
timeout=10,
return_times=True,
)

print(mfs)

# RatioOfNumericFeatures
# {'RatioOfNumericFeatures': {'value': 0.5, 'compute_time': 3.9138991269283e-05}}
```
**Warning:** Metafeatures are timed as if each dependency has to be recomputed whenever it is needed.
This means that the returned times may not be accurate for a particular application, especially if a
metafeature depends on a computationally intensive resource in multiple places.

## Using the Test Suite

Expand Down
6 changes: 6 additions & 0 deletions metalearn/metafeatures/general_resource_computers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
import metalearn.metafeatures.constants as consts


def get_X(X_raw):
return X_raw.dropna(axis=1, how="all"),

get_X = ResourceComputer(get_X, ["X"])

def get_cv_seed(seed_base, seed_offset):
return (seed_base + seed_offset,)

Expand Down Expand Up @@ -235,6 +240,7 @@ def get_text_features_with_no_missing_values(
instances in this module.
"""
resources_info = build_resources_info(
get_X,
get_cv_seed,
sample_columns,
sample_rows,
Expand Down
41 changes: 27 additions & 14 deletions metalearn/metafeatures/metafeatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class Metafeatures(object):

# noop resource computers for the user-provided resources
# `_get_arguments` and `_resource_is_target_dependent` assume ResourceComputer instances
for resource_name in ["X_raw", "X", "Y", "column_types", "sample_shape", "seed_base", "n_folds"]:
for resource_name in ["X_raw", "Y", "column_types", "sample_shape", "seed_base", "n_folds"]:
_resources_info[resource_name] = ResourceComputer(lambda: None, [resource_name])

_mfs_info = [
Expand Down Expand Up @@ -85,7 +85,7 @@ def compute(
self, X: DataFrame, Y: Series=None,
column_types: Dict[str, str]=None, metafeature_ids: List=None,
exclude: List=None, sample_shape=None, seed=None, n_folds=2,
verbose=False, timeout=None
verbose=False, timeout=None, return_times=False
) -> dict:
"""
Parameters
Expand All @@ -112,6 +112,9 @@ def compute(
will be run to completion. Otherwise, execution will halt after
approximately timeout seconds. Any metafeatures that have not been
computed will be labeled 'TIMEOUT'.
return_times: bool, default False. When true, includes compute times for
each metafeature. **Note** compute times are overestimated.
See https://github.com/byu-dml/metalearn/issues/205.
Returns
-------
Expand All @@ -123,7 +126,7 @@ def compute(
start_time = time.time()
self._validate_compute_arguments(
X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
)
if timeout is None:
def check_time():
Expand All @@ -145,7 +148,7 @@ def check_time():
seed = np.random.randint(np.iinfo(np.int32).max)
self._validate_compute_arguments(
X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
)

self._init_resources(
Expand Down Expand Up @@ -176,6 +179,10 @@ def check_time():
except TimeoutError:
pass

if not return_times:
for mf, result_dict in computed_metafeatures.items():
del result_dict[consts.COMPUTE_TIME_KEY]

return computed_metafeatures

def _format_resource(self, value, compute_time):
Expand All @@ -191,7 +198,6 @@ def _init_resources(
# Add the base resources to our resources hash
self._resources = {
"X_raw": self._format_resource(X, 0.), # TODO: rename to X
"X": self._format_resource(X.dropna(axis=1, how="all"), 0.), # TODO: make resource computer; rename
"Y": self._format_resource(Y, 0.),
"column_types": self._format_resource(column_types, 0.),
"sample_shape": self._format_resource(sample_shape, 0.),
Expand All @@ -216,7 +222,7 @@ def _resource_is_target_dependent(cls, resource_id):

def _validate_compute_arguments(
self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
):
for f in [
self._validate_X, self._validate_Y, self._validate_column_types,
Expand All @@ -225,12 +231,12 @@ def _validate_compute_arguments(
]:
f(
X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
)

def _validate_X(
self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
):
if not isinstance(X, pd.DataFrame):
raise TypeError('X must be of type pandas.DataFrame')
Expand All @@ -239,7 +245,7 @@ def _validate_X(

def _validate_Y(
self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
):
if not isinstance(Y, pd.Series) and not Y is None:
raise TypeError('Y must be of type pandas.Series')
Expand All @@ -248,7 +254,7 @@ def _validate_Y(

def _validate_column_types(
self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
):
if not column_types is None:
invalid_column_types = {}
Expand All @@ -272,7 +278,7 @@ def _validate_column_types(

def _validate_metafeature_ids(
self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
):
ids = None
if metafeature_ids is not None and exclude is not None:
Expand All @@ -295,7 +301,7 @@ def _validate_metafeature_ids(

def _validate_sample_shape(
self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
):
if not sample_shape is None:
if not type(sample_shape) in [tuple, list]:
Expand All @@ -317,7 +323,7 @@ def _validate_sample_shape(

def _validate_n_folds(
self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
):
if not dtype_is_numeric(type(n_folds)) or (n_folds != int(n_folds)):
raise ValueError(f"`n_folds` must be an integer, not {n_folds}")
Expand All @@ -344,11 +350,18 @@ def _validate_n_folds(

def _validate_verbose(
self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose
n_folds, verbose, return_times
):
if not type(verbose) is bool:
raise ValueError("`verbose` must be of type bool.")

def _validate_return_times(
self, X, Y, column_types, metafeature_ids, exclude, sample_shape, seed,
n_folds, verbose, return_times
):
if not type(return_times) is bool:
raise ValueError("`return_times` must be of type bool.")

# todo: intelligently infer TEXT data type
def _infer_column_types(self, X, Y):
column_types = {}
Expand Down
3 changes: 1 addition & 2 deletions metalearn/metafeatures/metafeatures_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
"dataset_metafeature": {
"type": "object",
"required": [
"value",
"compute_time"
"value"
],
"properties": {
"value": {
Expand Down
Loading

0 comments on commit ffe55fa

Please sign in to comment.