Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove redundancies #192

Merged
merged 21 commits into from
Aug 16, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ ENV/

# IDE configurations
.idea/
.vscode/

# miscellaneous
test_all_datasets.py
Expand Down
49 changes: 49 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Contributing to `metalearn`

## Adding New Metafeatures

`metalearn` uses a caching mechanism to cache expensive computations that may need to be used again within the package by another function. Both resources (e.g. the dataset itself or a preprocessed version of it) and metafeatures (e.g. entropy, number of features) are cached by the system.

When adding a new metafeature to the package, the function that computes that metafeature needs to be registered in the `resources_info` variable in [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py). Before the function can be registered though, it needs to be decorated with metadata by being passed through the `MetafeatureComputer` constructor (see example below). This allows the metafeatures returned by the function to be used intelligently by the package.

Follow the example below to learn how to write and register new metafeature(s). Note that a metafeature-computing function (e.g. `get_dataset_stats` as seen below) can compute and return more than one metafeature.

```python
# Import needed utilities
from metalearn.metafeatures.base import MetafeatureComputer
from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup

# Declare the function that computes the metafeatures.
def get_dataset_stats(X, column_types):
    # Read both metafeatures off the shape of the dataset:
    # rows are instances, columns are features.
    number_of_instances, number_of_features = X.shape[0], X.shape[1]
    # Hand them back as a tuple (two metafeatures in this example).
    return (number_of_instances, number_of_features)

# Decorate the metafeature-computing function with data
# the package will use.
get_dataset_stats = MetafeatureComputer(
    # Pass the function into the `MetafeatureComputer`
    # decorator.
    computer=get_dataset_stats,
    # Give each metafeature returned by the function a
    # name for the cache to use (order here must match the
    # order they are returned in by `computer`).
    returns=[
        "NumberOfInstances",
        "NumberOfFeatures"
    ],
    # Associate a problem type with the new metafeatures.
    problem_type=ProblemType.ANY,
    # Associate one or more metafeature groups.
    groups=[MetafeatureGroup.SIMPLE],
    # Specify which values to pass to the function
    # when calling it to compute the metafeatures.
    # Here we are passing the cached resource called
    # "X_raw" as the value for this function's "X" argument.
    # Arguments not listed here (e.g. "column_types") are
    # looked up under their own name.
    argmap={ "X": "X_raw" }
)
```

By convention, all the decorated metafeature-computing functions in a module are aggregated at the bottom of the module into a list called `metafeature_computers`, which is then imported by [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py) and added to that module's `resources_info` variable.
1 change: 0 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
include metalearn/metafeatures/metafeatures.json
include metalearn/metafeatures/metafeatures_schema.json
bjschoenfeld marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 1 addition & 1 deletion metalearn/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .metafeatures.metafeatures import Metafeatures
from .metafeatures.resources import METAFEATURE_CONFIG, METAFEATURES_JSON_SCHEMA
from .metafeatures.resources import METAFEATURES_JSON_SCHEMA
176 changes: 176 additions & 0 deletions metalearn/metafeatures/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import inspect
from abc import ABC, abstractmethod
epeters3 marked this conversation as resolved.
Show resolved Hide resolved
from typing import List, Callable, Dict, Union, Optional, Any
import itertools

from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup

class ResourceComputer:
    """
    Wraps a resource-computing function together with metadata about it:
    the names of the resources it returns and the mapping from its
    argument names to the values/resources that should be passed to it.
    """

    def __init__(
        self,
        computer: Callable,
        returns: List[str],
        argmap: Optional[Dict[str,Any]] = None
    ) -> None:
        """
        Decorates ``computer``, a resource computing function
        with metadata about that function.

        Parameters
        ----------
        computer
            The function that computes the resources.
        returns
            The names of the resources that ``computer`` returns, specified in
            the same order as ``computer`` returns them.
        argmap
            A custom map of ``computer``'s argument names to the global resource names
            that will be passed as ``computer``'s arguments when ``computer`` is called.
            ``None`` (the default) means "no overrides".
        """
        self._computer = computer
        self.returns = returns

        self.argmap = {}

        # reversing is needed because `self.defaults` gives the default
        # argument values corresponding to the *last* `n` arguments in the
        # function signature.
        reversed_args = self.args[::-1]
        reversed_defaults = self.defaults[::-1]
        arg_default_pairs = itertools.zip_longest(reversed_args, reversed_defaults)

        for local_name, default in arg_default_pairs:
            # By default, just use the `computer` function's
            # normal local argument names in the argmap,
            # making sure to preserve default argument values
            # when they are supplied.
            # NOTE: `zip_longest` pads missing defaults with `None`, so an
            # argument whose declared default *is* `None` is treated the
            # same as an argument with no default.
            if default is not None:
                # The function has a default value for this arg;
                # use that.
                self.argmap[local_name] = default
            else:
                # This function has no default. Tell the system
                # to pass in the global resource identified by
                # this arg's ``local_name`` when calling this
                # ``computer``.
                self.argmap[local_name] = local_name

        if argmap is not None:
            # Now include any argument name or value overrides
            # the developer has provided. Note: each value
            # may be a global resource name (e.g. `"XSample"`) or
            # a direct value for the argument (e.g. `5`)
            self.argmap.update(argmap)

    def __call__(self, *args, **kwargs):
        """
        Allows a ``ResourceComputer`` instance to be callable.
        Just forwards all arguments on to self._computer.
        """
        return self._computer(*args, **kwargs)

    @property
    def args(self) -> list:
        """Returns a list of the positional parameter names of self._computer"""
        return inspect.getfullargspec(self._computer).args

    @property
    def defaults(self) -> list:
        """
        Returns a list of the default argument values corresponding to the
        last `n` positional parameters of self._computer (empty when it
        declares none). See
        https://docs.python.org/3/library/inspect.html#inspect.getfullargspec
        """
        defaults = inspect.getfullargspec(self._computer).defaults
        # `getfullargspec` reports "no defaults" as None and otherwise a
        # tuple; normalize both to a list to match the annotation.
        return [] if defaults is None else list(defaults)

    @property
    def name(self) -> str:
        """Returns the function name of self._computer"""
        return self._computer.__name__


class MetafeatureComputer(ResourceComputer):
    """
    A ``ResourceComputer`` whose returned resources are metafeatures, and
    which additionally records the problem type and metafeature groups
    those metafeatures belong to.
    """

    def __init__(
        self,
        computer: Callable,
        returns: List[str], # TODO: Add support for passing just a string, not a list?
        problem_type: ProblemType,
        groups: List[MetafeatureGroup],
        argmap: Optional[Dict[str,Any]] = None
    ) -> None:
        """
        Decorates ``computer``, a metafeature computing function
        with metadata about that function.

        Parameters
        ----------
        computer
            The function that computes the metafeatures.
        returns
            The names of the metafeatures that ``computer`` returns, specified in
            the same order as ``computer`` returns them.
        problem_type
            The type of ML problem `computer`'s metafeatures can be computed for.
        groups
            The metafeature groups this computer's returned metafeatures belong to.
            e.g. statistical, info-theoretic, simple, etc.
        argmap
            A custom map of ``computer``'s argument names to the global resource names
            that will be passed as ``computer``'s arguments when ``computer`` is called.
            ``None`` (the default) means "no overrides".
        """
        # Normalize `None` to a fresh dict here so the base class always
        # receives a mapping and no default dict is shared between calls.
        super(MetafeatureComputer, self).__init__(
            computer, returns, {} if argmap is None else argmap
        )
        self.groups = groups
        self.problem_type = problem_type


class ResourceComputerMap:
    """
    Maps resource names to the ``ResourceComputer`` that produces them,
    refusing to register a second computer for a resource name that is
    already present.
    """

    def __init__(self, computers: Union[ResourceComputer,List[ResourceComputer],None] = None) -> None:
        """
        Wraps a dictionary map of resource names to their computers.
        Includes visibility into whether duplicate computers
        are trying to become associated with a resource in the map e.g.
        if a package developer has accidentally declared two computers
        that return the same resource.
        """
        self._map: Dict[str,ResourceComputer] = {}
        if computers is not None:
            self.add(computers)

    def __contains__(self, key):
        """Called to implement membership test operators. e.g. `key in my_resource_map`."""
        return key in self._map

    def add(self, computers: Union[ResourceComputer,List[ResourceComputer]]) -> None:
        """
        Adds more resource name/resource computer key/value
        pairs to a resource map, throwing an error on duplicates.

        Raises
        ------
        ValueError
            If ``computers`` is neither a ``ResourceComputer`` nor a list,
            if a list element is not a ``ResourceComputer``, or if a
            resource name is already registered.
        """
        if isinstance(computers, list):
            for computer in computers:
                self._add_one(computer)
        elif isinstance(computers, ResourceComputer):
            self._add_one(computers)
        else:
            raise ValueError("computers must be ResourceComputer or List[ResourceComputer]")

    def get(self, key: Optional[str] = None) -> Union[Dict[str,ResourceComputer],ResourceComputer]:
        """
        Used for getting the resource map.

        With ``key`` given, returns the computer registered for that
        resource name (``KeyError`` if absent); with no ``key``, returns
        the underlying dictionary itself.
        """
        if key is not None:
            return self._map[key]
        return self._map

    def _add_one(self, computer: ResourceComputer) -> None:
        """
        Registers ``computer`` under every name in ``computer.returns``.

        All names are validated before any is registered, so a duplicate
        leaves the map exactly as it was before the call.
        """
        if not isinstance(computer, ResourceComputer):
            raise ValueError(f"computer is not a ResourceComputer; it is a {type(computer)}")

        # Stage the new entries first; commit only if none of them clash.
        pending: Dict[str,ResourceComputer] = {}
        for resource_name in computer.returns:
            owner = self._map.get(resource_name, pending.get(resource_name))
            if owner is not None:
                raise ValueError(
                    f"duplicate computer '{computer.name}' provided for resource '{resource_name}', "
                    f"which is already present in the resouce map, registered "
                    f"by computer '{owner.name}'"
                )
            pending[resource_name] = computer
        self._map.update(pending)
10 changes: 6 additions & 4 deletions metalearn/metafeatures/common_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from scipy.stats import skew, kurtosis

import metalearn.metafeatures.constants as consts

def profile_distribution(data):
"""
Compute the mean, standard deviation, min, quartile1, quartile2, quartile3, and max of a vector
Expand All @@ -16,21 +18,21 @@ def profile_distribution(data):
features = dictionary containing the min, max, mean, and standard deviation
"""
if len(data) == 0:
return (data, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)
return (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)
else:
ddof = 1 if len(data) > 1 else 0
dist_mean = np.mean(data)
dist_stdev = np.std(data, ddof=ddof)
dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max = np.percentile(data, [0,25,50,75,100])
dist_skew = skew(data)
dist_kurtosis = kurtosis(data)
return (data, dist_mean, dist_stdev, dist_skew, dist_kurtosis, dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max)
return (dist_mean, dist_stdev, dist_skew, dist_kurtosis, dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max)

def get_numeric_features(dataframe, column_types):
    """
    Return the names of the columns of ``dataframe``, in column order,
    whose entry in ``column_types`` equals ``consts.NUMERIC``.
    """
    # Compare against the shared constant rather than a repeated string literal.
    return [feature for feature in dataframe.columns if column_types[feature] == consts.NUMERIC]

def get_categorical_features(dataframe, column_types):
    """
    Return the names of the columns of ``dataframe``, in column order,
    whose entry in ``column_types`` equals ``consts.CATEGORICAL``.
    """
    # Compare against the shared constant rather than a repeated string literal.
    return [feature for feature in dataframe.columns if column_types[feature] == consts.CATEGORICAL]

def dtype_is_numeric(dtype):
    """
    Return True when the string form of ``dtype`` contains the substring
    "int" or "float", and False otherwise.
    """
    dtype_name = str(dtype)
    return any(marker in dtype_name for marker in ("int", "float"))
27 changes: 27 additions & 0 deletions metalearn/metafeatures/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from enum import Enum

# Constant Enums
class ProblemType(Enum):
    """The type of ML problem a computer's metafeatures can be computed for."""
    CLASSIFICATION = "classification"
    REGRESSION = "regression"
    # Used by the CONTRIBUTING example for metafeatures derived from the dataset's shape alone.
    ANY = "any"

class MetafeatureGroup(Enum):
    """
    The groups a metafeature can belong to (a computer may be associated
    with one or more of them).
    """
    # NOTE(review): presumably a selector meaning "every group" rather than
    # a group in its own right — confirm against callers.
    ALL = "all"
    SIMPLE = "simple"
    TEXT = "text"
    STATISTICAL = "statistical"
    INFO_THEORETIC = "info_theoretic"
    LANDMARKING = "landmarking"
    MODEL_BASED = "model_based"
    TARGET_DEPENDENT = "target_dependent"

# Constant strings
# NOTE(review): presumably the keys of a per-metafeature result record
# (its value and how long it took to compute) — confirm against callers.
VALUE_KEY = 'value'
COMPUTE_TIME_KEY = 'compute_time'
# Column-type label: common_operations.get_numeric_features selects the
# columns whose `column_types` entry equals this.
NUMERIC = "NUMERIC"
epeters3 marked this conversation as resolved.
Show resolved Hide resolved
# NOTE(review): presumably a column-type label like NUMERIC/CATEGORICAL — confirm.
TEXT = "TEXT"
# Column-type label: common_operations.get_categorical_features selects the
# columns whose `column_types` entry equals this.
CATEGORICAL = "CATEGORICAL"
# NOTE(review): usage is not visible here; these look like sentinel strings
# recorded in place of a metafeature value that could not be computed
# (no targets, numeric targets, timeout) — confirm against callers.
NO_TARGETS = "NO_TARGETS"
NUMERIC_TARGETS = "NUMERIC_TARGETS"
TIMEOUT = "TIMEOUT"
Loading