Skip to content

Commit

Permalink
Merge pull request #192 from byu-dml/remove-redundancies
Browse files Browse the repository at this point in the history
Remove redundancies
  • Loading branch information
bjschoenfeld authored Aug 16, 2019
2 parents 09510e3 + 71382b9 commit 1db4714
Show file tree
Hide file tree
Showing 23 changed files with 1,433 additions and 4,460 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ ENV/

# IDE configurations
.idea/
.vscode/

# miscellaneous
test_all_datasets.py
Expand Down
49 changes: 49 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Contributing to `metalearn`

## Adding New Metafeatures

`metalearn` caches the results of expensive computations so that other functions in the package can reuse them instead of recomputing them. Both resources (e.g. the dataset itself or a preprocessed version of it) and metafeatures (e.g. entropy, number of features) are cached by the system.

When adding a new metafeature to the package, the function that computes that metafeature needs to be included in the `Metafeatures` class definition in the `Metafeatures._mfs_info` attribute in [./metalearn/metafeatures/metafeatures.py](./metalearn/metafeatures/metafeatures.py). Before the function can be included though, it needs to be decorated with metadata by being passed through the `MetafeatureComputer` constructor (see example below). This allows the metafeatures returned by the function to be used intelligently by the package.

Follow the example below to learn how to write and register new metafeature(s). Note that a metafeature-computing function (e.g. `get_dataset_stats` as seen below) can compute and return more than one metafeature.

```python
# Import needed utilities
from metalearn.metafeatures.base import MetafeatureComputer
from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup

# Declare the function that computes the metafeatures.
def get_dataset_stats(X, column_types):
# Calculate metafeatures.
number_of_instances = X.shape[0]
number_of_features = X.shape[1]
# Return a tuple (here it's two metafeatures).
return (number_of_instances, number_of_features)

# Decorate the metafeature-computing function with data
# the package will use.
get_dataset_stats = MetafeatureComputer(
# Pass the function into the `MetafeatureComputer`
# decorator.
computer=get_dataset_stats,
# Give each metafeature returned by the function a
# name for the cache to use (order here must match the
# order they are returned in by `computer`).
returns=[
"NumberOfInstances",
"NumberOfFeatures"
],
# Associate a problem type with the new metafeatures.
problem_type=ProblemType.ANY,
# Associate one or more metafeature groups.
groups=[MetafeatureGroup.SIMPLE],
# Specify which values to pass to the function
# when calling it to compute the metafeatures.
# Here we are passing the cached resource called
# "X_raw" as the value for this function's "X" argument.
argmap={ "X": "X_raw" }
)
```

By convention, all the decorated metafeature-computing functions in a module are aggregated at the bottom of the module into a call to `build_resources_info`, which is then imported by [./metalearn/metafeatures/metafeatures.py](./metalearn/metafeatures/metafeatures.py) and added to the `_mfs_info` attribute of that module's `Metafeatures` class.
3 changes: 1 addition & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
include metalearn/metafeatures/metafeatures.json
include metalearn/metafeatures/metafeatures_schema.json
include metalearn/metafeatures/metafeatures_schema.json
2 changes: 1 addition & 1 deletion metalearn/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .metafeatures.metafeatures import Metafeatures
from .metafeatures.resources import METAFEATURE_CONFIG, METAFEATURES_JSON_SCHEMA
from .metafeatures.static_assets import METAFEATURES_JSON_SCHEMA_PATH
128 changes: 128 additions & 0 deletions metalearn/metafeatures/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from collections.abc import Mapping
import inspect
import itertools
from typing import List, Callable, Dict, Union, Optional, Any

from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup


class ResourceComputer:
    """
    Decorates ``computer``, a resource computing function, with metadata about that function.

    Parameters
    ----------
    computer
        The function that computes the resources. It must declare only plain positional
        parameters: no ``*args``, no ``**kwargs``, no keyword-only parameters and no defaults.
    returns
        The names of the resources that ``computer`` returns, specified in the same order as ``computer`` returns
        them. A single name may be given as a bare string; it is stored as a one-element list.
    argmap
        A custom map of ``computer``'s argument names to the global resource names that will be passed as
        ``computer``'s arguments when ``computer`` is called.

    Raises
    ------
    ValueError
        If ``computer``'s signature uses varargs, keyword varargs, keyword-only arguments or default values.
    """

    def __init__(
        self, computer: Callable, returns: Union[str, List[str]], argmap: Optional[Dict[str, Any]] = None
    ) -> None:
        argspec = inspect.getfullargspec(computer)
        # TODO: If needed, add support for `computer` functions that use these types of arguments.
        if (
            argspec.varargs is not None or argspec.varkw is not None or argspec.defaults is not None or
            len(argspec.kwonlyargs) > 0
        ):
            raise ValueError('`computer` must use only positional arguments with no default values')

        self.computer = computer
        # A bare string is itself iterable, so code that loops over `returns` would see one
        # "resource" per character. Wrap it; copy anything else so that later mutation of the
        # caller's list cannot silently change this computer's metadata.
        self.returns = [returns] if isinstance(returns, str) else list(returns)
        # By default every argument is fed from the resource that shares its name.
        self.argmap = {arg_name: arg_name for arg_name in argspec.args}

        if argmap is not None:
            # override computer arg value with developer provided values
            # Note each value in `argmap` is a global resource name (e.g. `'XSample'`) or a literal value (e.g. `5`)
            self.argmap.update(argmap)

    def __call__(self, *args, **kwargs):
        """
        Allows a ``ResourceComputer`` instance to be callable. Just forwards all arguments on to self.computer.
        """
        return self.computer(*args, **kwargs)

    @property
    def name(self) -> str:
        """Returns the function name of self.computer"""
        return self.computer.__name__


class MetafeatureComputer(ResourceComputer):
    """
    Wraps ``computer``, a metafeature computing function, together with
    metadata describing the metafeatures it produces.

    Parameters
    ----------
    computer
        The function that computes the metafeatures.
    returns
        The names of the metafeatures that ``computer`` returns, listed in
        the same order in which ``computer`` returns them.
    problem_type
        The type of ML problem ``computer``'s metafeatures can be computed for.
    groups
        The metafeature groups that the returned metafeatures belong to,
        e.g. statistical, info-theoretic, simple, etc.
    argmap
        A custom map of ``computer``'s argument names to the global resource names
        that will be passed as ``computer``'s arguments when ``computer`` is called.
    """

    def __init__(
        self,
        computer: Callable,
        returns: List[str],
        problem_type: ProblemType,
        groups: List[MetafeatureGroup],
        argmap: Optional[Dict[str,str]] = None,
    ) -> None:
        # TODO: Add support for passing a string to `returns`, not just a list?
        # The generic resource metadata is handled entirely by the parent class.
        super().__init__(computer=computer, returns=returns, argmap=argmap)
        # Metafeature-specific metadata.
        self.problem_type = problem_type
        self.groups = groups


class collectordict(Mapping):
    """
    A partially mutable mapping in which keys can be set at most one time.

    A LookupError is raised if a key is set more than once. Keys cannot be deleted.
    For simplicity, all values must be set manually, not in __init__.
    """

    # Type of the backing container; instantiated once per instance in __init__.
    dict_cls = dict

    def __init__(self):
        self._dict = self.dict_cls()

    def __getitem__(self, key):
        return self._dict[key]

    def __iter__(self):
        return iter(self._dict)

    def __len__(self):
        return len(self._dict)

    def __repr__(self):
        # `Mapping` supplies no repr, so show the contents to ease debugging.
        return f'{type(self).__name__}({self._dict!r})'

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``; raise LookupError if ``key`` is already present."""
        if key in self._dict:
            raise LookupError(f'{key} already exists')
        self._dict[key] = value

    def update(self, mapping: Mapping):
        """
        Add every item of ``mapping``.

        Raises LookupError, without adding anything, if any key of ``mapping`` is already present.
        """
        pending = list(mapping.items())
        # Validate everything up front so that a collision part-way through cannot leave
        # this mapping holding only some of the new items.
        for key, _ in pending:
            if key in self._dict:
                raise LookupError(f'{key} already exists')
        for key, value in pending:
            self[key] = value


def build_resources_info(*computers: ResourceComputer) -> collectordict:
    """
    Builds a mapping from each resource name to the computer that returns it.

    Every name listed in a computer's ``returns`` becomes a key whose value is that computer.
    """
    name_to_computer = collectordict()
    # Flatten (computer, returned name) pairs in the order the computers were given.
    registrations = (
        (returned_name, resource_computer)
        for resource_computer in computers
        for returned_name in resource_computer.returns
    )
    for returned_name, resource_computer in registrations:
        name_to_computer[returned_name] = resource_computer
    return name_to_computer
10 changes: 6 additions & 4 deletions metalearn/metafeatures/common_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from scipy.stats import skew, kurtosis

import metalearn.metafeatures.constants as consts

def profile_distribution(data):
"""
Compute the mean, standard deviation, min, quartile1, quartile2, quartile3, and max of a vector
Expand All @@ -16,21 +18,21 @@ def profile_distribution(data):
features = dictionary containing the min, max, mean, and standard deviation
"""
if len(data) == 0:
return (data, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)
return (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)
else:
ddof = 1 if len(data) > 1 else 0
dist_mean = np.mean(data)
dist_stdev = np.std(data, ddof=ddof)
dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max = np.percentile(data, [0,25,50,75,100])
dist_skew = skew(data)
dist_kurtosis = kurtosis(data)
return (data, dist_mean, dist_stdev, dist_skew, dist_kurtosis, dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max)
return (dist_mean, dist_stdev, dist_skew, dist_kurtosis, dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max)

def get_numeric_features(dataframe, column_types):
return [feature for feature in dataframe.columns if column_types[feature] == "NUMERIC"]
return [feature for feature in dataframe.columns if column_types[feature] == consts.NUMERIC]

def get_categorical_features(dataframe, column_types):
return [feature for feature in dataframe.columns if column_types[feature] == "CATEGORICAL"]
return [feature for feature in dataframe.columns if column_types[feature] == consts.CATEGORICAL]

def dtype_is_numeric(dtype):
    """Return True when the string form of ``dtype`` contains "int" or "float"."""
    dtype_name = str(dtype)
    return any(marker in dtype_name for marker in ("int", "float"))
27 changes: 27 additions & 0 deletions metalearn/metafeatures/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from enum import Enum

# Constant Enums
class ProblemType(Enum):
    """The type of ML problem a computer's metafeatures can be computed for."""
    CLASSIFICATION = 'classification'
    REGRESSION = 'regression'
    ANY = 'any'

class MetafeatureGroup(Enum):
    """The groups a metafeature can belong to; a metafeature computer lists one or more of these."""
    ALL = 'all'
    SIMPLE = 'simple'
    TEXT = 'text'
    STATISTICAL = 'statistical'
    INFO_THEORETIC = 'info_theoretic'
    LANDMARKING = 'landmarking'
    MODEL_BASED = 'model_based'
    TARGET_DEPENDENT = 'target_dependent'

# Constant strings
# NOTE(review): presumably the keys under which a computed metafeature's value and timing
# are reported -- usage is not visible here, confirm against callers.
VALUE_KEY = 'value'
COMPUTE_TIME_KEY = 'compute_time'
# Column type labels. `common_operations` compares the values of `column_types` against
# NUMERIC and CATEGORICAL when selecting features.
NUMERIC = 'NUMERIC'
TEXT = 'TEXT'
CATEGORICAL = 'CATEGORICAL'
# NOTE(review): these look like sentinel strings reported in place of a metafeature value;
# usage is not visible here -- confirm against callers.
NO_TARGETS = 'NO_TARGETS'
NUMERIC_TARGETS = 'NUMERIC_TARGETS'
TIMEOUT = 'TIMEOUT'
Loading

0 comments on commit 1db4714

Please sign in to comment.