-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #192 from byu-dml/remove-redundancies
Remove redundancies
- Loading branch information
Showing
23 changed files
with
1,433 additions
and
4,460 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -102,6 +102,7 @@ ENV/ | |
|
||
# IDE configurations | ||
.idea/ | ||
.vscode/ | ||
|
||
# miscellaneous | ||
test_all_datasets.py | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Contributing to `metalearn` | ||
|
||
## Adding New Metafeatures | ||
|
||
`metalearn` uses a caching mechanism so that expensive computations can be reused by other functions within the package instead of being recomputed. Both resources (e.g. the dataset itself or a preprocessed version of it) and metafeatures (e.g. entropy, number of features) are cached by the system. | ||
|
||
When adding a new metafeature to the package, the function that computes that metafeature needs to be included in the `Metafeatures` class definition in the `Metafeatures._mfs_info` attribute in [./metalearn/metafeatures/metafeatures.py](./metalearn/metafeatures/metafeatures.py). Before the function can be included though, it needs to be decorated with metadata by being passed through the `MetafeatureComputer` constructor (see example below). This allows the metafeatures returned by the function to be used intelligently by the package. | ||
|
||
Follow the example below to see how to write and register new metafeature(s). Note that a metafeature-computing function (e.g. `get_dataset_stats` as seen below) can compute and return more than one metafeature. | ||
|
||
```python | ||
# Import needed utilities | ||
from metalearn.metafeatures.base import MetafeatureComputer | ||
from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup | ||
|
||
# Declare the function that computes the metafeatures. | ||
def get_dataset_stats(X, column_types): | ||
# Calculate metafeatures. | ||
number_of_instances = X.shape[0] | ||
number_of_features = X.shape[1] | ||
# Return a tuple (here it's two metafeatures). | ||
return (number_of_instances, number_of_features) | ||
|
||
# Decorate the metafeature-computing function with data | ||
# the package will use. | ||
get_dataset_stats = MetafeatureComputer( | ||
# Pass the function into the `MetafeatureComputer` | ||
# decorator. | ||
computer=get_dataset_stats, | ||
# Give each metafeature returned by the function a | ||
# name for the cache to use (order here must match the | ||
# order they are returned in by `computer`). | ||
returns=[ | ||
"NumberOfInstances", | ||
"NumberOfFeatures" | ||
], | ||
# Associate a problem type with the new metafeatures. | ||
problem_type=ProblemType.ANY, | ||
# Associate one or more metafeature groups. | ||
groups=[MetafeatureGroup.SIMPLE], | ||
# Specify which values to pass to the function | ||
# when calling it to compute the metafeatures. | ||
# Here we are passing the cached resource called | ||
# "X_raw" as the value for this function's "X" argument. | ||
argmap={ "X": "X_raw" } | ||
) | ||
``` | ||
|
||
By convention, all the decorated metafeature-computing functions in a module are aggregated at the bottom of the module into a call to `build_resources_info`, which is then imported by [./metalearn/metafeatures/metafeatures.py](./metalearn/metafeatures/metafeatures.py) and added to the `_mfs_info` attribute of that module's `Metafeatures` class. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1 @@ | ||
include metalearn/metafeatures/metafeatures.json | ||
include metalearn/metafeatures/metafeatures_schema.json | ||
include metalearn/metafeatures/metafeatures_schema.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
from .metafeatures.metafeatures import Metafeatures | ||
from .metafeatures.resources import METAFEATURE_CONFIG, METAFEATURES_JSON_SCHEMA | ||
from .metafeatures.static_assets import METAFEATURES_JSON_SCHEMA_PATH |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
from collections.abc import Mapping | ||
import inspect | ||
import itertools | ||
from typing import List, Callable, Dict, Union, Optional, Any | ||
|
||
from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup | ||
|
||
|
||
class ResourceComputer:
    """
    Decorates ``computer``, a resource computing function, with metadata about that function.

    Parameters
    ----------
    computer
        The function that computes the resources. It must declare only plain positional
        arguments: no ``*args``, no ``**kwargs``, no keyword-only arguments and no default values.
    returns
        The names of the resources that ``computer`` returns, specified in the same order as ``computer``
        returns them. A single name may also be given as a bare string; it is stored as a one-element list.
    argmap
        A custom map of ``computer``'s argument names to the global resource names (e.g. ``'XSample'``) or
        literal values (e.g. ``5``) that will be passed as ``computer``'s arguments when ``computer`` is
        called. Arguments not listed here map to a resource of the same name.

    Raises
    ------
    ValueError
        If ``computer`` declares ``*args``, ``**kwargs``, keyword-only arguments or default values.
    """

    def __init__(
        self, computer: Callable, returns: Union[str, List[str]], argmap: Optional[Dict[str, Any]] = None
    ) -> None:
        argspec = inspect.getfullargspec(computer)
        # TODO: If needed, add support for `computer` functions that use these types of arguments.
        if (
            argspec.varargs is not None or argspec.varkw is not None or argspec.defaults is not None or
            len(argspec.kwonlyargs) > 0
        ):
            raise ValueError('`computer` must use only positional arguments with no default values')

        # A bare string is itself iterable, so code that loops over `returns` would see one
        # "name" per character. Normalise it so `returns` is always a list of names.
        if isinstance(returns, str):
            returns = [returns]

        self.computer = computer
        self.returns = returns
        # By default every argument is fed from the resource that shares its name.
        self.argmap = {arg_name: arg_name for arg_name in argspec.args}

        if argmap is not None:
            # override computer arg value with developer provided values
            # Note each value in `argmap` is a global resource name (e.g. `'XSample'`) or a literal value (e.g. `5`)
            self.argmap.update(argmap)

    def __call__(self, *args, **kwargs):
        """
        Allows a ``ResourceComputer`` instance to be callable. Just forwards all arguments on to self.computer.
        """
        return self.computer(*args, **kwargs)

    @property
    def name(self) -> str:
        """Returns the function name of self.computer"""
        return self.computer.__name__
|
||
|
||
class MetafeatureComputer(ResourceComputer):
    """
    Decorates ``computer``, a metafeature computing function
    with metadata about that function.

    Parameters
    ----------
    computer
        The function that computes the metafeatures.
    returns
        The names of the metafeatures that ``computer`` returns, specified in
        the same order as ``computer`` returns them.
    problem_type
        The type of ML problem `computer`'s metafeatures can be computed for.
    groups
        The metafeature groups this computer's returned metafeatures belong to.
        e.g. statistical, info-theoretic, simple, etc.
    argmap
        A custom map of ``computer``'s argument names to the global resource names
        (or literal values) that will be passed as ``computer``'s arguments when
        ``computer`` is called.
    """

    def __init__(
        self, computer: Callable, returns: List[str], problem_type: ProblemType, groups: List[MetafeatureGroup],
        argmap: Optional[Dict[str, Any]] = None
    ) -> None:
        # TODO: Add support for passing a string to `returns`, not just a list?
        # The base class validates `computer`'s signature and builds the argument map.
        super().__init__(computer, returns, argmap)
        self.groups = groups
        self.problem_type = problem_type
|
||
|
||
class collectordict(Mapping):
    """
    A partially mutable mapping in which keys can be set at most one time.

    A LookupError is raised if a key is set more than once. Keys cannot be deleted.
    For simplicity, all values must be set manually, not in __init__.
    """

    # Backing mapping type; a subclass may override this to change the storage class.
    dict_cls = dict

    def __init__(self):
        self._dict = self.dict_cls()

    def __repr__(self):
        # `Mapping` supplies no `__repr__`, so show the contents to make debugging easier.
        return f'{type(self).__name__}({self._dict!r})'

    def __getitem__(self, key):
        return self._dict[key]

    def __iter__(self):
        return iter(self._dict)

    def __len__(self):
        return len(self._dict)

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``; raise LookupError if ``key`` is already set."""
        if key in self._dict:
            raise LookupError(f'{key} already exists')
        self._dict[key] = value

    def update(self, mapping: Mapping):
        """
        Add every item of ``mapping``.

        The update is all-or-nothing: if any key of ``mapping`` is already set, a
        LookupError is raised and none of ``mapping``'s items are added.
        """
        items = list(mapping.items())
        # Check every key before inserting anything, so a collision cannot leave
        # the mapping partially updated.
        for key, _ in items:
            if key in self._dict:
                raise LookupError(f'{key} already exists')
        for key, value in items:
            self[key] = value
|
||
|
||
def build_resources_info(*computers: ResourceComputer) -> collectordict:
    """
    Build a lookup from each resource name to the computer that produces it.

    Every name listed in a computer's ``returns`` becomes a key whose value is that
    computer. Because the result is a ``collectordict``, a name that appears more
    than once raises a LookupError.
    """
    registry = collectordict()
    # Lazily pair each returned name with its computer, in the order the computers were given.
    name_computer_pairs = (
        (name, resource_computer)
        for resource_computer in computers
        for name in resource_computer.returns
    )
    for name, resource_computer in name_computer_pairs:
        registry[name] = resource_computer
    return registry
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from enum import Enum | ||
|
||
# Constant Enums | ||
class ProblemType(Enum):
    """The kinds of ML problem a metafeature can be associated with."""

    CLASSIFICATION = 'classification'
    REGRESSION = 'regression'
    # Presumably means "not tied to one problem type" -- confirm against usage.
    ANY = 'any'
|
||
class MetafeatureGroup(Enum):
    """The groups (e.g. simple, statistical, info-theoretic) a metafeature can belong to."""

    ALL = 'all'
    SIMPLE = 'simple'
    TEXT = 'text'
    STATISTICAL = 'statistical'
    INFO_THEORETIC = 'info_theoretic'
    LANDMARKING = 'landmarking'
    MODEL_BASED = 'model_based'
    TARGET_DEPENDENT = 'target_dependent'
|
||
# Constant strings

# Key names (presumably for per-metafeature result entries -- confirm against usage).
VALUE_KEY = 'value'
COMPUTE_TIME_KEY = 'compute_time'

# Type labels (presumably column types -- confirm against usage).
NUMERIC = 'NUMERIC'
TEXT = 'TEXT'
CATEGORICAL = 'CATEGORICAL'

# Sentinel strings (presumably stand in for a value that could not be computed -- confirm).
NO_TARGETS = 'NO_TARGETS'
NUMERIC_TARGETS = 'NUMERIC_TARGETS'
TIMEOUT = 'TIMEOUT'
Oops, something went wrong.