Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove redundancies #192

Merged
merged 21 commits into from
Aug 16, 2019
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ ENV/

# IDE configurations
.idea/
.vscode/

# miscellaneous
test_all_datasets.py
Expand Down
49 changes: 49 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Contributing to `metalearn`

## Adding New Metafeatures

`metalearn` uses a caching mechanism to cache expensive computations that may need to be used again within the package by another function. Both resources (e.g. the dataset itself or a preprocessed version of it) and metafeatures (e.g. entropy, number of features) are cached by the system.

When adding a new metafeature to the package, the function that computes that metafeature needs to be registered in the `resources_info` variable in [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py), and passed to the call made to `_get_metafeature_ids` in that module as well. Before the function can be registered and passed, though, it needs to be decorated with metadata by being passed through the `MetafeatureComputer` constructor (see example below). This allows the metafeatures returned by the function to be used intelligently by the package.

Follow the example below to see how to write and register new metafeature(s). Note that a metafeature-computing function (e.g. `get_dataset_stats` as seen below) can compute and return more than one metafeature.

```python
# Import needed utilities
from metalearn.metafeatures.base import MetafeatureComputer
from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup

# Declare the function that computes the metafeatures.
def get_dataset_stats(X, column_types):
    # Read both metafeatures straight off the shape of `X`:
    # rows are instances, columns are features.
    shape = X.shape
    instance_count, feature_count = shape[0], shape[1]
    # Hand them back as a tuple, in the same order the names are
    # listed in `returns` below.
    return (instance_count, feature_count)

# Decorate the metafeature-computing function with metadata
# the package will use.
get_dataset_stats = MetafeatureComputer(
# Pass the function into the `MetafeatureComputer`
# decorator.
computer=get_dataset_stats,
# Give each metafeature returned by the function a
# name for the cache to use (order here must match the
# order they are returned in by `computer`).
returns=[
"NumberOfInstances",
"NumberOfFeatures"
],
# Associate a problem type with the new metafeatures.
problem_type=ProblemType.ANY,
# Associate one or more metafeature groups.
groups=[MetafeatureGroup.SIMPLE],
# Specify which values to pass to the function
# when calling it to compute the metafeatures.
# Here we are passing the cached resource called
# "X_raw" as the value for this function's "X" argument.
argmap={ "X": "X_raw" }
)
```

By convention, all the decorated metafeature-computing functions in a module are aggregated at the bottom of the module into a list called `metafeature_computers`, which is then imported by [./metalearn/metafeatures/resources.py](./metalearn/metafeatures/resources.py) and added to that module's `resources_info` variable.
1 change: 0 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
include metalearn/metafeatures/metafeatures.json
include metalearn/metafeatures/metafeatures_schema.json
bjschoenfeld marked this conversation as resolved.
Show resolved Hide resolved
2 changes: 1 addition & 1 deletion metalearn/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .metafeatures.metafeatures import Metafeatures
from .metafeatures.resources import METAFEATURE_CONFIG, METAFEATURES_JSON_SCHEMA
from .metafeatures.resources import METAFEATURES_JSON_SCHEMA
197 changes: 197 additions & 0 deletions metalearn/metafeatures/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
import inspect
from typing import List, Callable, Dict, Union, Optional, Any
import itertools
from collections.abc import MutableMapping

from metalearn.metafeatures.constants import ProblemType, MetafeatureGroup

class ResourceComputer:
    """
    Wraps a resource-computing function together with metadata about it:
    the names of the resources it returns (``returns``) and a map from its
    parameter names to the values, or global resource names, it should be
    called with (``argmap``).
    """

    # Placeholder meaning "this parameter has no default value". A dedicated
    # sentinel is needed because ``None`` is itself a legitimate default.
    _NO_DEFAULT = object()

    def __init__(
        self,
        computer: Callable,
        returns: List[str],
        argmap: Optional[Dict[str,Any]] = None
    ) -> None:
        """
        Decorates ``computer``, a resource computing function
        with metadata about that function.

        Parameters
        ----------
        computer
            The function that computes the resources.
        returns
            The names of the resources that ``computer`` returns, specified in
            the same order as ``computer`` returns them.
        argmap
            A custom map of ``computer``'s argument names to the global resource names
            that will be passed as ``computer``'s arguments when ``computer`` is called.

        Raises
        ------
        ValueError
            If ``computer`` declares ``*args``, ``**kwargs`` or keyword-only
            parameters.
        """

        computer_args = inspect.getfullargspec(computer)
        # TODO: If needed, add support for `computer` functions that
        # use these types of arguments.
        if (
            computer_args.varargs is not None or
            computer_args.varkw is not None or
            len(computer_args.kwonlyargs) > 0
        ):
            raise ValueError((
                "ResourceComputer supports `computer` functions that "
                "use positional arguments only in their function definition."
            ))

        self._computer = computer
        self.returns = returns

        self.argmap = {}

        # reversing is needed because `self.defaults` gives the default
        # argument values corresponding to the *last* `n` arguments in the
        # function signature.
        reversed_args = self.args[::-1]
        reversed_defaults = self.defaults[::-1]
        # Pad the shorter defaults sequence with the sentinel rather than
        # `None`, so that a parameter declared as `param=None` is still
        # recognized as having a default.
        arg_default_pairs = itertools.zip_longest(
            reversed_args, reversed_defaults, fillvalue=self._NO_DEFAULT
        )

        for local_name, default in arg_default_pairs:
            # By default, just use the `computer` function's
            # normal local argument names in the argmap,
            # making sure to preserve default argument values
            # when they are supplied.
            if default is self._NO_DEFAULT:
                # This function has no default. Tell the system
                # to pass in the global resource identified by
                # this arg's ``local_name`` when calling this
                # ``computer``.
                self.argmap[local_name] = local_name
            else:
                # The function has a default value for this arg
                # (possibly `None`); use that.
                self.argmap[local_name] = default

        if argmap is not None:
            # Now include any argument name or value overrides
            # the developer has provided. Note: each value in `argmap`
            # may be a global resource name (e.g. `"XSample"`) or
            # a direct value for the argument (e.g. `5`)
            self.argmap.update(argmap)

    def __call__(self, *args, **kwargs):
        """
        Allows a ``ResourceComputer`` instance to be callable.
        Just forwards all arguments on to self._computer.
        """
        return self._computer(*args, **kwargs)

    @property
    def args(self) -> list:
        """Returns a list of the positional parameter names of self._computer"""
        return inspect.getfullargspec(self._computer).args

    @property
    def defaults(self) -> list:
        """
        Returns a list of the default argument values corresponding to the
        last `n` positional parameters of self._computer (see
        https://docs.python.org/3/library/inspect.html#inspect.getfullargspec).
        The list is empty when self._computer declares no defaults.
        """
        defaults = inspect.getfullargspec(self._computer).defaults
        # `getfullargspec` yields `None` or a tuple; always hand back a list
        # so the declared return type holds in both cases.
        return [] if defaults is None else list(defaults)

    @property
    def name(self) -> str:
        """Returns the function name of self._computer"""
        return self._computer.__name__


class MetafeatureComputer(ResourceComputer):
    """
    A ``ResourceComputer`` whose returned resources are metafeatures, and
    which additionally records the problem type and metafeature groups
    those metafeatures are associated with.
    """

    def __init__(
        self,
        computer: Callable,
        returns: List[str], # TODO: Add support for passing just a string, not a list?
        problem_type: ProblemType,
        groups: List[MetafeatureGroup],
        argmap: Optional[Dict[str,Any]] = None
    ) -> None:
        """
        Decorates ``computer``, a metafeature computing function
        with metadata about that function.

        Parameters
        ----------
        computer
            The function that computes the metafeatures.
        returns
            The names of the metafeatures that ``computer`` returns, specified in
            the same order as ``computer`` returns them.
        problem_type
            The type of ML problem `computer`'s metafeatures can be computed for.
        groups
            The metafeature groups this computer's returned metafeatures belong to.
            e.g. statistical, info-theoretic, simple, etc.
        argmap
            A custom map of ``computer``'s argument names to the global resource names
            that will be passed as ``computer``'s arguments when ``computer`` is called.
            ``None`` (the default) means no overrides.
        """
        # `None` rather than a shared `{}` literal is used as the default;
        # the base class already treats `None` as "no overrides".
        super().__init__(computer, returns, argmap)
        self.groups = groups
        self.problem_type = problem_type


class ResourceComputerMap(MutableMapping):
    """
    Mapping from resource name to the ``ResourceComputer`` registered for it.
    A resource name can be registered only once, and entries cannot be deleted.
    """

    def __init__(self, computers: Union[ResourceComputer,List[ResourceComputer],None] = None) -> None:
        """
        Wraps a dictionary map of resource names to their computers.
        Includes visibility into whether duplicate computers
        are trying to become associated with a resource in the map e.g.
        if a package developer has accidentally declared two computers
        that return the same resource.
        """
        self._map: Dict[str,ResourceComputer] = {}
        if computers is not None:
            self.add(computers)

    def add(self, computers: Union[ResourceComputer,List[ResourceComputer]]) -> None:
        """
        Adds more resource name/resource computer key/value
        pairs to a resource map, throwing an error on duplicates.

        Raises ``ValueError`` if ``computers`` is neither a
        ``ResourceComputer`` nor a list, if a list element is not a
        ``ResourceComputer``, or if a returned resource name is already
        registered.
        """
        if isinstance(computers, list):
            for computer in computers:
                self._add_one(computer)
        elif isinstance(computers, ResourceComputer):
            self._add_one(computers)
        else:
            raise ValueError("computers must be ResourceComputer or List[ResourceComputer]")

    def __getitem__(self, key: str = None) -> ResourceComputer:
        """Used for getting a resource from the map."""
        return self._map[key]

    def _add_one(self, computer: ResourceComputer) -> None:
        """Registers ``computer`` under every resource name in ``computer.returns``."""
        if not isinstance(computer, ResourceComputer):
            raise ValueError(f"computer is not a ResourceComputer; it is a {type(computer)}")

        for resource_name in computer.returns:
            self[resource_name] = computer

    def __setitem__(self, resource_name: str, computer: ResourceComputer):
        """
        Registers ``computer`` for ``resource_name``. Raises ``ValueError``
        instead of overwriting when the name is already registered.
        """
        if resource_name in self._map:
            raise ValueError(
                f"duplicate computer '{computer.name}' provided for resource '{resource_name}', "
                f"which is already present in the resource map, registered "
                f"by computer '{self._map[resource_name].name}'"
            )
        self._map[resource_name] = computer

    def __iter__(self):
        """Iterates over the registered resource names."""
        return iter(self._map)

    def __len__(self):
        """Returns the number of registered resource names."""
        return len(self._map)

    def __delitem__(self, key: str):
        """Deletion is not supported; always raises ``TypeError``."""
        raise TypeError("ResourceComputerMap does not support deletion of its ResourceComputers")
10 changes: 6 additions & 4 deletions metalearn/metafeatures/common_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from scipy.stats import skew, kurtosis

import metalearn.metafeatures.constants as consts

def profile_distribution(data):
"""
Compute the mean, standard deviation, min, quartile1, quartile2, quartile3, and max of a vector
Expand All @@ -16,21 +18,21 @@ def profile_distribution(data):
features = dictionary containing the min, max, mean, and standard deviation
"""
if len(data) == 0:
return (data, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)
return (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan)
else:
ddof = 1 if len(data) > 1 else 0
dist_mean = np.mean(data)
dist_stdev = np.std(data, ddof=ddof)
dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max = np.percentile(data, [0,25,50,75,100])
dist_skew = skew(data)
dist_kurtosis = kurtosis(data)
return (data, dist_mean, dist_stdev, dist_skew, dist_kurtosis, dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max)
return (dist_mean, dist_stdev, dist_skew, dist_kurtosis, dist_min, dist_quartile1, dist_quartile2, dist_quartile3, dist_max)

def get_numeric_features(dataframe, column_types):
    """
    Return the entries of ``dataframe.columns``, in order, whose value in
    ``column_types`` equals ``consts.NUMERIC``.

    ``column_types`` is indexed by every column, so it must contain an
    entry for each one.
    """
    return [feature for feature in dataframe.columns if column_types[feature] == consts.NUMERIC]

def get_categorical_features(dataframe, column_types):
    """
    Return the entries of ``dataframe.columns``, in order, whose value in
    ``column_types`` equals ``consts.CATEGORICAL``.

    ``column_types`` is indexed by every column, so it must contain an
    entry for each one.
    """
    return [feature for feature in dataframe.columns if column_types[feature] == consts.CATEGORICAL]

def dtype_is_numeric(dtype):
    """
    Report whether the string form of ``dtype`` contains "int" or "float".
    """
    dtype_name = str(dtype)
    return any(marker in dtype_name for marker in ("int", "float"))
27 changes: 27 additions & 0 deletions metalearn/metafeatures/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from enum import Enum

# Constant Enums
class ProblemType(Enum):
    """The type of ML problem a computer's metafeatures can be computed for."""

    CLASSIFICATION = "classification"
    REGRESSION = "regression"
    ANY = "any"

class MetafeatureGroup(Enum):
    """The groups a computer's returned metafeatures can belong to."""

    ALL = "all"
    SIMPLE = "simple"
    TEXT = "text"
    STATISTICAL = "statistical"
    INFO_THEORETIC = "info_theoretic"
    LANDMARKING = "landmarking"
    MODEL_BASED = "model_based"
    TARGET_DEPENDENT = "target_dependent"

# Constant strings

# Presumably keys of a per-metafeature result record; their use is not
# visible in this module -- confirm against the callers.
VALUE_KEY = "value"
COMPUTE_TIME_KEY = "compute_time"

# Column type labels. NUMERIC and CATEGORICAL are the values that
# `common_operations` compares `column_types` entries against.
NUMERIC = "NUMERIC"
TEXT = "TEXT"
CATEGORICAL = "CATEGORICAL"

# Remaining marker strings; their use is not visible in this module.
NO_TARGETS = "NO_TARGETS"
NUMERIC_TARGETS = "NUMERIC_TARGETS"
TIMEOUT = "TIMEOUT"
Loading